[gcc-6] 186/401: * Update the Linaro support to the 6-2016.07 snapshot.

Ximin Luo infinity0@debian.org
Wed Apr 5 15:49:04 UTC 2017


This is an automated email from the git hooks/post-receive script.

infinity0 pushed a commit to branch pu/reproducible_builds
in repository gcc-6.

commit c3769129f2252e631b8b07ab4f5ca25e52695404
Author: doko <doko@6ca36cf4-e1d1-0310-8c6f-e303bb2178ca>
Date:   Wed Aug 10 13:43:12 2016 +0000

      * Update the Linaro support to the 6-2016.07 snapshot.
    
    
    git-svn-id: svn://anonscm.debian.org/gcccvs/branches/sid/gcc-6@8948 6ca36cf4-e1d1-0310-8c6f-e303bb2178ca
---
 debian/changelog                         |    1 +
 debian/patches/gcc-linaro-doc.diff       |  189 +-
 debian/patches/gcc-linaro-no-macros.diff |   14 +-
 debian/patches/gcc-linaro.diff           | 5584 +++++++++++++++++++++++++++++-
 debian/rules.defs                        |    2 -
 debian/rules.patch                       |    3 +-
 6 files changed, 5780 insertions(+), 13 deletions(-)

diff --git a/debian/changelog b/debian/changelog
index 682c6fd..841d5aa 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -7,6 +7,7 @@ gcc-6 (6.1.1-12) UNRELEASED; urgency=medium
 
   * Fix running the libjava testsuite.
   * Revert fix for PR target/55947, causing PR libstdc++/72813. LP: #1610220.
+  * Update the Linaro support to the 6-2016.07 snapshot.
 
   [ Aurelien Jarno ]
   * Replace proposed fix for PR ipa/68273 by the corresponding patch taken
diff --git a/debian/patches/gcc-linaro-doc.diff b/debian/patches/gcc-linaro-doc.diff
index 68b2b5f..2909dd6 100644
--- a/debian/patches/gcc-linaro-doc.diff
+++ b/debian/patches/gcc-linaro-doc.diff
@@ -1,2 +1,189 @@
-# DP: Changes for the Linaro 5-2016.xx release (documentation).
+# DP: Changes for the Linaro 6-2016.07 release (documentation).
 
+--- a/src/gcc/doc/cpp.texi
++++ b/src/gcc/doc/cpp.texi
+@@ -1984,7 +1984,7 @@ by GCC, or a non-GCC compiler that claims to accept the GNU C dialects,
+ you can simply test @code{__GNUC__}.  If you need to write code
+ which depends on a specific version, you must be more careful.  Each
+ time the minor version is increased, the patch level is reset to zero;
+-each time the major version is increased (which happens rarely), the
++each time the major version is increased, the
+ minor version and patch level are reset.  If you wish to use the
+ predefined macros directly in the conditional, you will need to write it
+ like this:
+--- a/src/gcc/doc/invoke.texi
++++ b/src/gcc/doc/invoke.texi
+@@ -9478,6 +9478,11 @@ Size of minimal partition for WHOPR (in estimated instructions).
+ This prevents expenses of splitting very small programs into too many
+ partitions.
+ 
++@item lto-max-partition
++Size of max partition for WHOPR (in estimated instructions).
++to provide an upper bound for individual size of partition.
++Meant to be used only with balanced partitioning.
++
+ @item cxx-max-namespaces-for-diagnostic-help
+ The maximum number of namespaces to consult for suggestions when C++
+ name lookup fails for an identifier.  The default is 1000.
+@@ -12828,9 +12833,9 @@ These options are defined for AArch64 implementations:
+ @item -mabi=@var{name}
+ @opindex mabi
+ Generate code for the specified data model.  Permissible values
+-are @samp{ilp32} for SysV-like data model where int, long int and pointer
+-are 32-bit, and @samp{lp64} for SysV-like data model where int is 32-bit,
+-but long int and pointer are 64-bit.
++are @samp{ilp32} for SysV-like data model where int, long int and pointers
++are 32 bits, and @samp{lp64} for SysV-like data model where int is 32 bits,
++but long int and pointers are 64 bits.
+ 
+ The default depends on the specific target configuration.  Note that
+ the LP64 and ILP32 ABIs are not link-compatible; you must compile your
+@@ -12855,25 +12860,24 @@ Generate little-endian code.  This is the default when GCC is configured for an
+ @item -mcmodel=tiny
+ @opindex mcmodel=tiny
+ Generate code for the tiny code model.  The program and its statically defined
+-symbols must be within 1GB of each other.  Pointers are 64 bits.  Programs can
+-be statically or dynamically linked.  This model is not fully implemented and
+-mostly treated as @samp{small}.
++symbols must be within 1MB of each other.  Programs can be statically or
++dynamically linked.
+ 
+ @item -mcmodel=small
+ @opindex mcmodel=small
+ Generate code for the small code model.  The program and its statically defined
+-symbols must be within 4GB of each other.  Pointers are 64 bits.  Programs can
+-be statically or dynamically linked.  This is the default code model.
++symbols must be within 4GB of each other.  Programs can be statically or
++dynamically linked.  This is the default code model.
+ 
+ @item -mcmodel=large
+ @opindex mcmodel=large
+ Generate code for the large code model.  This makes no assumptions about
+-addresses and sizes of sections.  Pointers are 64 bits.  Programs can be
+-statically linked only.
++addresses and sizes of sections.  Programs can be statically linked only.
+ 
+ @item -mstrict-align
+ @opindex mstrict-align
+-Do not assume that unaligned memory references are handled by the system.
++Avoid generating memory accesses that may not be aligned on a natural object
++boundary as described in the architecture specification.
+ 
+ @item -momit-leaf-frame-pointer
+ @itemx -mno-omit-leaf-frame-pointer
+@@ -12895,7 +12899,7 @@ of TLS variables.
+ @item -mtls-size=@var{size}
+ @opindex mtls-size
+ Specify bit size of immediate TLS offsets.  Valid values are 12, 24, 32, 48.
+-This option depends on binutils higher than 2.25.
++This option requires binutils 2.26 or newer.
+ 
+ @item -mfix-cortex-a53-835769
+ @itemx -mno-fix-cortex-a53-835769
+@@ -12915,12 +12919,13 @@ corresponding flag to the linker.
+ 
+ @item -mlow-precision-recip-sqrt
+ @item -mno-low-precision-recip-sqrt
+- at opindex -mlow-precision-recip-sqrt
+- at opindex -mno-low-precision-recip-sqrt
+-When calculating the reciprocal square root approximation,
+-uses one less step than otherwise, thus reducing latency and precision.
+-This is only relevant if @option{-ffast-math} enables the reciprocal square root
+-approximation, which in turn depends on the target processor.
++@opindex mlow-precision-recip-sqrt
++@opindex mno-low-precision-recip-sqrt
++Enable or disable reciprocal square root approximation.
++This option only has an effect if @option{-ffast-math} or
++@option{-funsafe-math-optimizations} is used as well.  Enabling this reduces
++precision of reciprocal square root results to about 16 bits for
++single precision and to 32 bits for double precision.
+ 
+ @item -march=@var{name}
+ @opindex march
+@@ -12957,17 +12962,15 @@ Specify the name of the target processor for which GCC should tune the
+ performance of the code.  Permissible values for this option are:
+ @samp{generic}, @samp{cortex-a35}, @samp{cortex-a53}, @samp{cortex-a57},
+ @samp{cortex-a72}, @samp{exynos-m1}, @samp{qdf24xx}, @samp{thunderx},
+-@samp{xgene1}.
++@samp{xgene1}, @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53},
++@samp{native}.
+ 
+-Additionally, this option can specify that GCC should tune the performance
+-of the code for a big.LITTLE system.  Permissible values for this
+-option are: @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53}.
++The values @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53}
++specify that GCC should tune for a big.LITTLE system.
+ 
+ Additionally on native AArch64 GNU/Linux systems the value
+-@samp{native} is available.  This option causes the compiler to pick
+-the architecture of and tune the performance of the code for the
+-processor of the host system.  This option has no effect if the
+-compiler is unable to recognize the architecture of the host system.
++@samp{native} tunes performance to the host system.  This option has no effect
++if the compiler is unable to recognize the processor of the host system.
+ 
+ Where none of @option{-mtune=}, @option{-mcpu=} or @option{-march=}
+ are specified, the code is tuned to perform well across a range
+@@ -12987,12 +12990,6 @@ documented in the sub-section on
+ Feature Modifiers}.  Where conflicting feature modifiers are
+ specified, the right-most feature is used.
+ 
+-Additionally on native AArch64 GNU/Linux systems the value
+-@samp{native} is available.  This option causes the compiler to tune
+-the performance of the code for the processor of the host system.
+-This option has no effect if the compiler is unable to recognize the
+-architecture of the host system.
+-
+ GCC uses @var{name} to determine what kind of instructions it can emit when
+ generating assembly code (as if by @option{-march}) and to determine
+ the target processor for which to tune for performance (as if
+@@ -13010,11 +13007,11 @@ across releases.
+ This option is only intended to be useful when developing GCC.
+ 
+ @item -mpc-relative-literal-loads
+-@opindex mpcrelativeliteralloads
+-Enable PC relative literal loads. If this option is used, literal
+-pools are assumed to have a range of up to 1MiB and an appropriate
+-instruction sequence is used. This option has no impact when used
+-with @option{-mcmodel=tiny}.
++@opindex mpc-relative-literal-loads
++Enable PC-relative literal loads.  With this option literal pools are
++accessed using a single instruction and emitted after each function.  This
++limits the maximum size of functions to 1MB.  This is enabled by default for
++@option{-mcmodel=tiny}.
+ 
+ @end table
+ 
+@@ -13045,9 +13042,9 @@ Enable Large System Extension instructions.  This is on by default for
+ 
+ @end table
+ 
+-That is, @option{crypto} implies @option{simd} implies @option{fp}.
+-Conversely, @option{nofp} (or equivalently, @option{-mgeneral-regs-only})
+-implies @option{nosimd} implies @option{nocrypto}.
++Feature @option{crypto} implies @option{simd}, which implies @option{fp}.
++Conversely, @option{nofp} implies @option{nosimd}, which implies
++@option{nocrypto}.
+ 
+ @node Adapteva Epiphany Options
+ @subsection Adapteva Epiphany Options
+@@ -18082,7 +18079,7 @@ IEEE 754 floating-point data.
+ 
+ The @option{-mnan=legacy} option selects the legacy encoding.  In this
+ case quiet NaNs (qNaNs) are denoted by the first bit of their trailing
+-significand field being 0, whereas signalling NaNs (sNaNs) are denoted
++significand field being 0, whereas signaling NaNs (sNaNs) are denoted
+ by the first bit of their trailing significand field being 1.
+ 
+ The @option{-mnan=2008} option selects the IEEE 754-2008 encoding.  In
+--- a/src/gcc/doc/md.texi
++++ b/src/gcc/doc/md.texi
+@@ -5027,7 +5027,7 @@ it is unspecified which of the two operands is returned as the result.
+ IEEE-conformant minimum and maximum operations.  If one operand is a quiet
+ @code{NaN}, then the other operand is returned.  If both operands are quiet
+ @code{NaN}, then a quiet @code{NaN} is returned.  In the case when gcc supports
+-signalling @code{NaN} (-fsignaling-nans) an invalid floating point exception is
++signaling @code{NaN} (-fsignaling-nans) an invalid floating point exception is
+ raised and a quiet @code{NaN} is returned.
+ 
+ All operands have mode @var{m}, which is a scalar or vector
diff --git a/debian/patches/gcc-linaro-no-macros.diff b/debian/patches/gcc-linaro-no-macros.diff
index c4b8670..df3d913 100644
--- a/debian/patches/gcc-linaro-no-macros.diff
+++ b/debian/patches/gcc-linaro-no-macros.diff
@@ -4,7 +4,7 @@ Index: b/src/gcc/cppbuiltin.c
 ===================================================================
 --- a/src/gcc/cppbuiltin.c
 +++ b/src/gcc/cppbuiltin.c
-@@ -62,41 +62,18 @@ parse_basever (int *major, int *minor, i
+@@ -52,41 +52,18 @@ parse_basever (int *major, int *minor, i
      *patchlevel = s_patchlevel;
  }
  
@@ -51,7 +51,7 @@ Index: b/src/gcc/Makefile.in
 ===================================================================
 --- a/src/gcc/Makefile.in
 +++ b/src/gcc/Makefile.in
-@@ -810,12 +810,10 @@ BASEVER     := $(srcdir)/BASE-VER  # 4.x
+@@ -832,12 +832,10 @@ BASEVER     := $(srcdir)/BASE-VER  # 4.x
  DEVPHASE    := $(srcdir)/DEV-PHASE # experimental, prerelease, ""
  DATESTAMP   := $(srcdir)/DATESTAMP # YYYYMMDD or empty
  REVISION    := $(srcdir)/REVISION  # [BRANCH revision XXXXXX]
@@ -64,7 +64,7 @@ Index: b/src/gcc/Makefile.in
  
  ifeq (,$(wildcard $(REVISION)))
  REVISION_c  :=
-@@ -842,7 +840,6 @@ DATESTAMP_s := \
+@@ -864,7 +862,6 @@ DATESTAMP_s := \
    "\"$(if $(DEVPHASE_c)$(filter-out 0,$(PATCHLEVEL_c)), $(DATESTAMP_c))\""
  PKGVERSION_s:= "\"@PKGVERSION@\""
  BUGURL_s    := "\"@REPORT_BUGS_TO@\""
@@ -72,7 +72,7 @@ Index: b/src/gcc/Makefile.in
  
  PKGVERSION  := @PKGVERSION@
  BUGURL_TEXI := @REPORT_BUGS_TEXI@
-@@ -2622,9 +2619,8 @@ PREPROCESSOR_DEFINES = \
+@@ -2704,9 +2701,8 @@ PREPROCESSOR_DEFINES = \
    -DSTANDARD_EXEC_PREFIX=\"$(libdir)/gcc/\" \
    @TARGET_SYSTEM_ROOT_DEFINE@
  
@@ -88,13 +88,13 @@ Index: b/src/gcc/LINARO-VERSION
 ===================================================================
 --- a/src/gcc/LINARO-VERSION
 +++ /dev/null
-@@ -1,1 +0,0 @@
--5.2-2015.11~dev
+@@ -1 +0,0 @@
+-Snapshot 6.1-2016.07
 Index: b/src/gcc/configure.ac
 ===================================================================
 --- a/src/gcc/configure.ac
 +++ b/src/gcc/configure.ac
-@@ -862,7 +862,7 @@ AC_ARG_WITH(specs,
+@@ -903,7 +903,7 @@ AC_ARG_WITH(specs,
  )
  AC_SUBST(CONFIGURE_SPECS)
  
diff --git a/debian/patches/gcc-linaro.diff b/debian/patches/gcc-linaro.diff
index 7d54b2c..2ad91f2 100644
--- a/debian/patches/gcc-linaro.diff
+++ b/debian/patches/gcc-linaro.diff
@@ -1,7 +1,5587 @@
-# DP: Changes for the Linaro 5-2016.xx release.
+# DP: Changes for the Linaro 6-2016.07 release.
 
-LANG=C git diff 3e5774f831e9eca881babb16108038af1d444690 4c4566ae0c0962f52ccb4270c9c111dd17c6ac1a \
+MSG=$(git log origin/linaro/gcc-6-branch --format=format:"%s" -n 1 --grep "Merge branches"); SVN=${MSG##* }; git log origin/gcc-6-branch --format=format:"%H" -n 1 --grep "gcc-6-branch@${SVN%.}"
+
+LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b8175015c74b948ff1e32197 \
  | egrep -v '^(diff|index) ' \
  | filterdiff --strip=1 --addoldprefix=a/src/  --addnewprefix=b/src/ \
  | sed 's,a/src//dev/null,/dev/null,'
 
+--- /dev/null
++++ b/src/gcc/LINARO-VERSION
+@@ -0,0 +1 @@
++Snapshot 6.1-2016.07
+--- a/src/gcc/Makefile.in
++++ b/src/gcc/Makefile.in
+@@ -832,10 +832,12 @@ BASEVER     := $(srcdir)/BASE-VER  # 4.x.y
+ DEVPHASE    := $(srcdir)/DEV-PHASE # experimental, prerelease, ""
+ DATESTAMP   := $(srcdir)/DATESTAMP # YYYYMMDD or empty
+ REVISION    := $(srcdir)/REVISION  # [BRANCH revision XXXXXX]
++LINAROVER   := $(srcdir)/LINARO-VERSION # M.x-YYYY.MM[-S][~dev]
+ 
+ BASEVER_c   := $(shell cat $(BASEVER))
+ DEVPHASE_c  := $(shell cat $(DEVPHASE))
+ DATESTAMP_c := $(shell cat $(DATESTAMP))
++LINAROVER_c := $(shell cat $(LINAROVER))
+ 
+ ifeq (,$(wildcard $(REVISION)))
+ REVISION_c  :=
+@@ -862,6 +864,7 @@ DATESTAMP_s := \
+   "\"$(if $(DEVPHASE_c)$(filter-out 0,$(PATCHLEVEL_c)), $(DATESTAMP_c))\""
+ PKGVERSION_s:= "\"@PKGVERSION@\""
+ BUGURL_s    := "\"@REPORT_BUGS_TO@\""
++LINAROVER_s := "\"$(LINAROVER_c)\""
+ 
+ PKGVERSION  := @PKGVERSION@
+ BUGURL_TEXI := @REPORT_BUGS_TEXI@
+@@ -2701,8 +2704,9 @@ PREPROCESSOR_DEFINES = \
+   -DSTANDARD_EXEC_PREFIX=\"$(libdir)/gcc/\" \
+   @TARGET_SYSTEM_ROOT_DEFINE@
+ 
+-CFLAGS-cppbuiltin.o += $(PREPROCESSOR_DEFINES) -DBASEVER=$(BASEVER_s)
+-cppbuiltin.o: $(BASEVER)
++CFLAGS-cppbuiltin.o += $(PREPROCESSOR_DEFINES) -DBASEVER=$(BASEVER_s) \
++	-DLINAROVER=$(LINAROVER_s)
++cppbuiltin.o: $(BASEVER) $(LINAROVER)
+ 
+ CFLAGS-cppdefault.o += $(PREPROCESSOR_DEFINES)
+ 
+--- a/src/gcc/config.gcc
++++ b/src/gcc/config.gcc
+@@ -3795,38 +3795,40 @@ case "${target}" in
+ 		# Add extra multilibs
+ 		if test "x$with_multilib_list" != x; then
+ 			arm_multilibs=`echo $with_multilib_list | sed -e 's/,/ /g'`
+-			for arm_multilib in ${arm_multilibs}; do
+-				case ${arm_multilib} in
+-				aprofile)
++			case ${arm_multilibs} in
++			aprofile)
+ 				# Note that arm/t-aprofile is a
+ 				# stand-alone make file fragment to be
+ 				# used only with itself.  We do not
+ 				# specifically use the
+ 				# TM_MULTILIB_OPTION framework because
+ 				# this shorthand is more
+-				# pragmatic. Additionally it is only
+-				# designed to work without any
+-				# with-cpu, with-arch with-mode
++				# pragmatic.
++				tmake_profile_file="arm/t-aprofile"
++				;;
++			default)
++				;;
++			*)
++				echo "Error: --with-multilib-list=${with_multilib_list} not supported." 1>&2
++				exit 1
++				;;
++			esac
++
++			if test "x${tmake_profile_file}" != x ; then
++				# arm/t-aprofile is only designed to work
++				# without any with-cpu, with-arch, with-mode,
+ 				# with-fpu or with-float options.
+-					if test "x$with_arch" != x \
+-					    || test "x$with_cpu" != x \
+-					    || test "x$with_float" != x \
+-					    || test "x$with_fpu" != x \
+-					    || test "x$with_mode" != x ; then
+-					    echo "Error: You cannot use any of --with-arch/cpu/fpu/float/mode with --with-multilib-list=aprofile" 1>&2
+-					    exit 1
+-					fi
+-					tmake_file="${tmake_file} arm/t-aprofile"
+-					break
+-					;;
+-				default)
+-					;;
+-				*)
+-					echo "Error: --with-multilib-list=${with_multilib_list} not supported." 1>&2
+-					exit 1
+-					;;
+-				esac
+-			done
++				if test "x$with_arch" != x \
++				    || test "x$with_cpu" != x \
++				    || test "x$with_float" != x \
++				    || test "x$with_fpu" != x \
++				    || test "x$with_mode" != x ; then
++				    echo "Error: You cannot use any of --with-arch/cpu/fpu/float/mode with --with-multilib-list=${with_multilib_list}" 1>&2
++				    exit 1
++				fi
++
++				tmake_file="${tmake_file} ${tmake_profile_file}"
++			fi
+ 		fi
+ 		;;
+ 
+--- a/src/gcc/config/aarch64/aarch64-elf.h
++++ b/src/gcc/config/aarch64/aarch64-elf.h
+@@ -25,15 +25,6 @@
+ #define ASM_OUTPUT_LABELREF(FILE, NAME) \
+   aarch64_asm_output_labelref (FILE, NAME)
+ 
+-#define ASM_OUTPUT_DEF(FILE, NAME1, NAME2)	\
+-  do						\
+-    {						\
+-      assemble_name (FILE, NAME1);		\
+-      fputs (" = ", FILE);			\
+-      assemble_name (FILE, NAME2);		\
+-      fputc ('\n', FILE);			\
+-    } while (0)
+-
+ #define TEXT_SECTION_ASM_OP	"\t.text"
+ #define DATA_SECTION_ASM_OP	"\t.data"
+ #define BSS_SECTION_ASM_OP	"\t.bss"
+--- a/src/gcc/config/aarch64/aarch64-modes.def
++++ b/src/gcc/config/aarch64/aarch64-modes.def
+@@ -21,8 +21,6 @@
+ CC_MODE (CCFP);
+ CC_MODE (CCFPE);
+ CC_MODE (CC_SWP);
+-CC_MODE (CC_ZESWP); /* zero-extend LHS (but swap to make it RHS).  */
+-CC_MODE (CC_SESWP); /* sign-extend LHS (but swap to make it RHS).  */
+ CC_MODE (CC_NZ);    /* Only N and Z bits of condition flags are valid.  */
+ CC_MODE (CC_Z);     /* Only Z bit of condition flags is valid.  */
+ CC_MODE (CC_C);     /* Only C bit of condition flags is valid.  */
+--- a/src/gcc/config/aarch64/aarch64-protos.h
++++ b/src/gcc/config/aarch64/aarch64-protos.h
+@@ -290,6 +290,7 @@ bool aarch64_constant_address_p (rtx);
+ bool aarch64_expand_movmem (rtx *);
+ bool aarch64_float_const_zero_rtx_p (rtx);
+ bool aarch64_function_arg_regno_p (unsigned);
++bool aarch64_fusion_enabled_p (enum aarch64_fusion_pairs);
+ bool aarch64_gen_movmemqi (rtx *);
+ bool aarch64_gimple_fold_builtin (gimple_stmt_iterator *);
+ bool aarch64_is_extend_from_extract (machine_mode, rtx, rtx);
+@@ -335,11 +336,9 @@ machine_mode aarch64_hard_regno_caller_save_mode (unsigned, unsigned,
+ 						       machine_mode);
+ int aarch64_hard_regno_mode_ok (unsigned, machine_mode);
+ int aarch64_hard_regno_nregs (unsigned, machine_mode);
+-int aarch64_simd_attr_length_move (rtx_insn *);
+ int aarch64_uxt_size (int, HOST_WIDE_INT);
+ int aarch64_vec_fpconst_pow_of_2 (rtx);
+ rtx aarch64_final_eh_return_addr (void);
+-rtx aarch64_legitimize_reload_address (rtx *, machine_mode, int, int, int);
+ rtx aarch64_mask_from_zextract_ops (rtx, rtx);
+ const char *aarch64_output_move_struct (rtx *operands);
+ rtx aarch64_return_addr (int, rtx);
+--- a/src/gcc/config/aarch64/aarch64-simd.md
++++ b/src/gcc/config/aarch64/aarch64-simd.md
+@@ -371,15 +371,15 @@
+   [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")]
+ )
+ 
+-(define_insn "*aarch64_mul3_elt_to_128df"
+-  [(set (match_operand:V2DF 0 "register_operand" "=w")
+-     (mult:V2DF
+-       (vec_duplicate:V2DF
+-	 (match_operand:DF 2 "register_operand" "w"))
+-      (match_operand:V2DF 1 "register_operand" "w")))]
++(define_insn "*aarch64_mul3_elt_from_dup<mode>"
++ [(set (match_operand:VMUL 0 "register_operand" "=w")
++    (mult:VMUL
++      (vec_duplicate:VMUL
++	    (match_operand:<VEL> 1 "register_operand" "<h_con>"))
++      (match_operand:VMUL 2 "register_operand" "w")))]
+   "TARGET_SIMD"
+-  "fmul\\t%0.2d, %1.2d, %2.d[0]"
+-  [(set_attr "type" "neon_fp_mul_d_scalar_q")]
++  "<f>mul\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]";
++  [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")]
+ )
+ 
+ (define_insn "aarch64_rsqrte_<mode>2"
+@@ -1579,16 +1579,16 @@
+   [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
+ )
+ 
+-(define_insn "*aarch64_fma4_elt_to_128df"
+-  [(set (match_operand:V2DF 0 "register_operand" "=w")
+-    (fma:V2DF
+-      (vec_duplicate:V2DF
+-	  (match_operand:DF 1 "register_operand" "w"))
+-      (match_operand:V2DF 2 "register_operand" "w")
+-      (match_operand:V2DF 3 "register_operand" "0")))]
++(define_insn "*aarch64_fma4_elt_from_dup<mode>"
++  [(set (match_operand:VMUL 0 "register_operand" "=w")
++    (fma:VMUL
++      (vec_duplicate:VMUL
++	  (match_operand:<VEL> 1 "register_operand" "w"))
++      (match_operand:VMUL 2 "register_operand" "w")
++      (match_operand:VMUL 3 "register_operand" "0")))]
+   "TARGET_SIMD"
+-  "fmla\\t%0.2d, %2.2d, %1.2d[0]"
+-  [(set_attr "type" "neon_fp_mla_d_scalar_q")]
++  "fmla\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
++  [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")]
+ )
+ 
+ (define_insn "*aarch64_fma4_elt_to_64v2df"
+@@ -1656,17 +1656,17 @@
+   [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
+ )
+ 
+-(define_insn "*aarch64_fnma4_elt_to_128df"
+-  [(set (match_operand:V2DF 0 "register_operand" "=w")
+-    (fma:V2DF
+-      (neg:V2DF
+-        (match_operand:V2DF 2 "register_operand" "w"))
+-      (vec_duplicate:V2DF
+-	(match_operand:DF 1 "register_operand" "w"))
+-      (match_operand:V2DF 3 "register_operand" "0")))]
+-  "TARGET_SIMD"
+-  "fmls\\t%0.2d, %2.2d, %1.2d[0]"
+-  [(set_attr "type" "neon_fp_mla_d_scalar_q")]
++(define_insn "*aarch64_fnma4_elt_from_dup<mode>"
++  [(set (match_operand:VMUL 0 "register_operand" "=w")
++    (fma:VMUL
++      (neg:VMUL
++        (match_operand:VMUL 2 "register_operand" "w"))
++      (vec_duplicate:VMUL
++	(match_operand:<VEL> 1 "register_operand" "w"))
++      (match_operand:VMUL 3 "register_operand" "0")))]
++  "TARGET_SIMD"
++  "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
++  [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")]
+ )
+ 
+ (define_insn "*aarch64_fnma4_elt_to_64v2df"
+@@ -1979,19 +1979,6 @@
+   }
+ )
+ 
+-(define_expand "reduc_plus_scal_<mode>"
+-  [(match_operand:<VEL> 0 "register_operand" "=w")
+-   (match_operand:V2F 1 "register_operand" "w")]
+-  "TARGET_SIMD"
+-  {
+-    rtx elt = GEN_INT (ENDIAN_LANE_N (<MODE>mode, 0));
+-    rtx scratch = gen_reg_rtx (<MODE>mode);
+-    emit_insn (gen_aarch64_reduc_plus_internal<mode> (scratch, operands[1]));
+-    emit_insn (gen_aarch64_get_lane<mode> (operands[0], scratch, elt));
+-    DONE;
+-  }
+-)
+-
+ (define_insn "aarch64_reduc_plus_internal<mode>"
+  [(set (match_operand:VDQV 0 "register_operand" "=w")
+        (unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")]
+@@ -2010,9 +1997,9 @@
+   [(set_attr "type" "neon_reduc_add")]
+ )
+ 
+-(define_insn "aarch64_reduc_plus_internal<mode>"
+- [(set (match_operand:V2F 0 "register_operand" "=w")
+-       (unspec:V2F [(match_operand:V2F 1 "register_operand" "w")]
++(define_insn "reduc_plus_scal_<mode>"
++ [(set (match_operand:<VEL> 0 "register_operand" "=w")
++       (unspec:<VEL> [(match_operand:V2F 1 "register_operand" "w")]
+ 		   UNSPEC_FADDV))]
+  "TARGET_SIMD"
+  "faddp\\t%<Vetype>0, %1.<Vtype>"
+@@ -2635,7 +2622,7 @@
+ (define_insn "*aarch64_combinez<mode>"
+   [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
+         (vec_concat:<VDBL>
+-	   (match_operand:VD_BHSI 1 "general_operand" "w,r,m")
++	   (match_operand:VD_BHSI 1 "general_operand" "w,?r,m")
+ 	   (match_operand:VD_BHSI 2 "aarch64_simd_imm_zero" "Dz,Dz,Dz")))]
+   "TARGET_SIMD && !BYTES_BIG_ENDIAN"
+   "@
+@@ -2651,7 +2638,7 @@
+   [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
+         (vec_concat:<VDBL>
+ 	   (match_operand:VD_BHSI 2 "aarch64_simd_imm_zero" "Dz,Dz,Dz")
+-	   (match_operand:VD_BHSI 1 "general_operand" "w,r,m")))]
++	   (match_operand:VD_BHSI 1 "general_operand" "w,?r,m")))]
+   "TARGET_SIMD && BYTES_BIG_ENDIAN"
+   "@
+    mov\\t%0.8b, %1.8b
+@@ -4652,7 +4639,7 @@
+    ld1\\t{%S0.16b - %<Vendreg>0.16b}, %1"
+   [(set_attr "type" "multiple,neon_store<nregs>_<nregs>reg_q,\
+ 		     neon_load<nregs>_<nregs>reg_q")
+-   (set (attr "length") (symbol_ref "aarch64_simd_attr_length_move (insn)"))]
++   (set_attr "length" "<insn_count>,4,4")]
+ )
+ 
+ (define_insn "aarch64_be_ld1<mode>"
+@@ -4685,7 +4672,7 @@
+    stp\\t%q1, %R1, %0
+    ldp\\t%q0, %R0, %1"
+   [(set_attr "type" "multiple,neon_stp_q,neon_ldp_q")
+-   (set (attr "length") (symbol_ref "aarch64_simd_attr_length_move (insn)"))]
++   (set_attr "length" "8,4,4")]
+ )
+ 
+ (define_insn "*aarch64_be_movci"
+@@ -4696,7 +4683,7 @@
+        || register_operand (operands[1], CImode))"
+   "#"
+   [(set_attr "type" "multiple")
+-   (set (attr "length") (symbol_ref "aarch64_simd_attr_length_move (insn)"))]
++   (set_attr "length" "12,4,4")]
+ )
+ 
+ (define_insn "*aarch64_be_movxi"
+@@ -4707,7 +4694,7 @@
+        || register_operand (operands[1], XImode))"
+   "#"
+   [(set_attr "type" "multiple")
+-   (set (attr "length") (symbol_ref "aarch64_simd_attr_length_move (insn)"))]
++   (set_attr "length" "16,4,4")]
+ )
+ 
+ (define_split
+@@ -5414,13 +5401,25 @@
+   [(set_attr "type" "crypto_aese")]
+ )
+ 
++;; When AES/AESMC fusion is enabled we want the register allocation to
++;; look like:
++;;    AESE Vn, _
++;;    AESMC Vn, Vn
++;; So prefer to tie operand 1 to operand 0 when fusing.
++
+ (define_insn "aarch64_crypto_aes<aesmc_op>v16qi"
+-  [(set (match_operand:V16QI 0 "register_operand" "=w")
+-	(unspec:V16QI [(match_operand:V16QI 1 "register_operand" "w")]
++  [(set (match_operand:V16QI 0 "register_operand" "=w,w")
++	(unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0,w")]
+ 	 CRYPTO_AESMC))]
+   "TARGET_SIMD && TARGET_CRYPTO"
+   "aes<aesmc_op>\\t%0.16b, %1.16b"
+-  [(set_attr "type" "crypto_aesmc")]
++  [(set_attr "type" "crypto_aesmc")
++   (set_attr_alternative "enabled"
++     [(if_then_else (match_test
++		       "aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)")
++		     (const_string "yes" )
++		     (const_string "no"))
++      (const_string "yes")])]
+ )
+ 
+ ;; sha1
+--- a/src/gcc/config/aarch64/aarch64.c
++++ b/src/gcc/config/aarch64/aarch64.c
+@@ -3582,7 +3582,12 @@ aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
+   return aarch64_tls_referenced_p (x);
+ }
+ 
+-/* Implement TARGET_CASE_VALUES_THRESHOLD.  */
++/* Implement TARGET_CASE_VALUES_THRESHOLD.
++   The expansion for a table switch is quite expensive due to the number
++   of instructions, the table lookup and hard to predict indirect jump.
++   When optimizing for speed, and -O3 enabled, use the per-core tuning if 
++   set, otherwise use tables for > 16 cases as a tradeoff between size and
++   performance.  When optimizing for size, use the default setting.  */
+ 
+ static unsigned int
+ aarch64_case_values_threshold (void)
+@@ -3593,7 +3598,7 @@ aarch64_case_values_threshold (void)
+       && selected_cpu->tune->max_case_values != 0)
+     return selected_cpu->tune->max_case_values;
+   else
+-    return default_case_values_threshold ();
++    return optimize_size ? default_case_values_threshold () : 17;
+ }
+ 
+ /* Return true if register REGNO is a valid index register.
+@@ -4232,14 +4237,6 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
+       && GET_CODE (x) == NEG)
+     return CC_Zmode;
+ 
+-  /* A compare of a mode narrower than SI mode against zero can be done
+-     by extending the value in the comparison.  */
+-  if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
+-      && y == const0_rtx)
+-    /* Only use sign-extension if we really need it.  */
+-    return ((code == GT || code == GE || code == LE || code == LT)
+-	    ? CC_SESWPmode : CC_ZESWPmode);
+-
+   /* A test for unsigned overflow.  */
+   if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
+       && code == NE
+@@ -4308,8 +4305,6 @@ aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
+       break;
+ 
+     case CC_SWPmode:
+-    case CC_ZESWPmode:
+-    case CC_SESWPmode:
+       switch (comp_code)
+ 	{
+ 	case NE: return AARCH64_NE;
+@@ -5022,120 +5017,6 @@ aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
+   return x;
+ }
+ 
+-/* Try a machine-dependent way of reloading an illegitimate address
+-   operand.  If we find one, push the reload and return the new rtx.  */
+-
+-rtx
+-aarch64_legitimize_reload_address (rtx *x_p,
+-				   machine_mode mode,
+-				   int opnum, int type,
+-				   int ind_levels ATTRIBUTE_UNUSED)
+-{
+-  rtx x = *x_p;
+-
+-  /* Do not allow mem (plus (reg, const)) if vector struct mode.  */
+-  if (aarch64_vect_struct_mode_p (mode)
+-      && GET_CODE (x) == PLUS
+-      && REG_P (XEXP (x, 0))
+-      && CONST_INT_P (XEXP (x, 1)))
+-    {
+-      rtx orig_rtx = x;
+-      x = copy_rtx (x);
+-      push_reload (orig_rtx, NULL_RTX, x_p, NULL,
+-		   BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
+-		   opnum, (enum reload_type) type);
+-      return x;
+-    }
+-
+-  /* We must recognize output that we have already generated ourselves.  */
+-  if (GET_CODE (x) == PLUS
+-      && GET_CODE (XEXP (x, 0)) == PLUS
+-      && REG_P (XEXP (XEXP (x, 0), 0))
+-      && CONST_INT_P (XEXP (XEXP (x, 0), 1))
+-      && CONST_INT_P (XEXP (x, 1)))
+-    {
+-      push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
+-		   BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
+-		   opnum, (enum reload_type) type);
+-      return x;
+-    }
+-
+-  /* We wish to handle large displacements off a base register by splitting
+-     the addend across an add and the mem insn.  This can cut the number of
+-     extra insns needed from 3 to 1.  It is only useful for load/store of a
+-     single register with 12 bit offset field.  */
+-  if (GET_CODE (x) == PLUS
+-      && REG_P (XEXP (x, 0))
+-      && CONST_INT_P (XEXP (x, 1))
+-      && HARD_REGISTER_P (XEXP (x, 0))
+-      && mode != TImode
+-      && mode != TFmode
+-      && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
+-    {
+-      HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
+-      HOST_WIDE_INT low = val & 0xfff;
+-      HOST_WIDE_INT high = val - low;
+-      HOST_WIDE_INT offs;
+-      rtx cst;
+-      machine_mode xmode = GET_MODE (x);
+-
+-      /* In ILP32, xmode can be either DImode or SImode.  */
+-      gcc_assert (xmode == DImode || xmode == SImode);
+-
+-      /* Reload non-zero BLKmode offsets.  This is because we cannot ascertain
+-	 BLKmode alignment.  */
+-      if (GET_MODE_SIZE (mode) == 0)
+-	return NULL_RTX;
+-
+-      offs = low % GET_MODE_SIZE (mode);
+-
+-      /* Align misaligned offset by adjusting high part to compensate.  */
+-      if (offs != 0)
+-	{
+-	  if (aarch64_uimm12_shift (high + offs))
+-	    {
+-	      /* Align down.  */
+-	      low = low - offs;
+-	      high = high + offs;
+-	    }
+-	  else
+-	    {
+-	      /* Align up.  */
+-	      offs = GET_MODE_SIZE (mode) - offs;
+-	      low = low + offs;
+-	      high = high + (low & 0x1000) - offs;
+-	      low &= 0xfff;
+-	    }
+-	}
+-
+-      /* Check for overflow.  */
+-      if (high + low != val)
+-	return NULL_RTX;
+-
+-      cst = GEN_INT (high);
+-      if (!aarch64_uimm12_shift (high))
+-	cst = force_const_mem (xmode, cst);
+-
+-      /* Reload high part into base reg, leaving the low part
+-	 in the mem instruction.
+-	 Note that replacing this gen_rtx_PLUS with plus_constant is
+-	 wrong in this case because we rely on the
+-	 (plus (plus reg c1) c2) structure being preserved so that
+-	 XEXP (*p, 0) in push_reload below uses the correct term.  */
+-      x = gen_rtx_PLUS (xmode,
+-			gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
+-			GEN_INT (low));
+-
+-      push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
+-		   BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
+-		   opnum, (enum reload_type) type);
+-      return x;
+-    }
+-
+-  return NULL_RTX;
+-}
+-
+-
+ /* Return the reload icode required for a constant pool in mode.  */
+ static enum insn_code
+ aarch64_constant_pool_reload_icode (machine_mode mode)
+@@ -6411,10 +6292,6 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
+           /* TODO: A write to the CC flags possibly costs extra, this
+ 	     needs encoding in the cost tables.  */
+ 
+-          /* CC_ZESWPmode supports zero extend for free.  */
+-          if (mode == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
+-            op0 = XEXP (op0, 0);
+-
+ 	  mode = GET_MODE (op0);
+           /* ANDS.  */
+           if (GET_CODE (op0) == AND)
+@@ -10851,33 +10728,6 @@ aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
+ 		      gen_rtx_REG (mode, rsrc + count - i - 1));
+ }
+ 
+-/* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
+-   one of VSTRUCT modes: OI, CI or XI.  */
+-int
+-aarch64_simd_attr_length_move (rtx_insn *insn)
+-{
+-  machine_mode mode;
+-
+-  extract_insn_cached (insn);
+-
+-  if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
+-    {
+-      mode = GET_MODE (recog_data.operand[0]);
+-      switch (mode)
+-	{
+-	case OImode:
+-	  return 8;
+-	case CImode:
+-	  return 12;
+-	case XImode:
+-	  return 16;
+-	default:
+-	  gcc_unreachable ();
+-	}
+-    }
+-  return 4;
+-}
+-
+ /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
+    one of VSTRUCT modes: OI, CI, or XI.  */
+ int
+@@ -11959,12 +11809,11 @@ aarch64_output_simd_mov_immediate (rtx const_vector,
+         info.value = GEN_INT (0);
+       else
+ 	{
+-#define buf_size 20
++	  const unsigned int buf_size = 20;
+ 	  char float_buf[buf_size] = {'\0'};
+ 	  real_to_decimal_for_mode (float_buf,
+ 				    CONST_DOUBLE_REAL_VALUE (info.value),
+ 				    buf_size, buf_size, 1, mode);
+-#undef buf_size
+ 
+ 	  if (lane_count == 1)
+ 	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
+@@ -13317,6 +13166,14 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
+   return false;
+ }
+ 
++/* Return true iff the instruction fusion described by OP is enabled.  */
++
++bool
++aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
++{
++  return (aarch64_tune_params.fusible_ops & op) != 0;
++}
++
+ /* If MEM is in the form of [base+offset], extract the two parts
+    of address and set to BASE and OFFSET, otherwise return false
+    after clearing BASE and OFFSET.  */
+@@ -14232,6 +14089,9 @@ aarch64_optab_supported_p (int op, machine_mode, machine_mode,
+ #undef TARGET_OPTAB_SUPPORTED_P
+ #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
+ 
++#undef TARGET_OMIT_STRUCT_RETURN_REG
++#define TARGET_OMIT_STRUCT_RETURN_REG true
++
+ struct gcc_target targetm = TARGET_INITIALIZER;
+ 
+ #include "gt-aarch64.h"
+--- a/src/gcc/config/aarch64/aarch64.h
++++ b/src/gcc/config/aarch64/aarch64.h
+@@ -652,21 +652,6 @@ typedef struct
+ 
+ #define CONSTANT_ADDRESS_P(X)		aarch64_constant_address_p(X)
+ 
+-/* Try a machine-dependent way of reloading an illegitimate address
+-   operand.  If we find one, push the reload and jump to WIN.  This
+-   macro is used in only one place: `find_reloads_address' in reload.c.  */
+-
+-#define LEGITIMIZE_RELOAD_ADDRESS(X, MODE, OPNUM, TYPE, IND_L, WIN)	     \
+-do {									     \
+-  rtx new_x = aarch64_legitimize_reload_address (&(X), MODE, OPNUM, TYPE,    \
+-						 IND_L);		     \
+-  if (new_x)								     \
+-    {									     \
+-      X = new_x;							     \
+-      goto WIN;								     \
+-    }									     \
+-} while (0)
+-
+ #define REGNO_OK_FOR_BASE_P(REGNO)	\
+   aarch64_regno_ok_for_base_p (REGNO, true)
+ 
+@@ -845,7 +830,7 @@ do {									     \
+ #define CANNOT_CHANGE_MODE_CLASS(FROM, TO, CLASS)	\
+   aarch64_cannot_change_mode_class (FROM, TO, CLASS)
+ 
+-#define SHIFT_COUNT_TRUNCATED !TARGET_SIMD
++#define SHIFT_COUNT_TRUNCATED (!TARGET_SIMD)
+ 
+ /* Choose appropriate mode for caller saves, so we do the minimum
+    required size of load/store.  */
+--- a/src/gcc/config/aarch64/aarch64.md
++++ b/src/gcc/config/aarch64/aarch64.md
+@@ -1783,7 +1783,7 @@
+   "aarch64_zero_extend_const_eq (<DWI>mode, operands[2],
+ 				 <MODE>mode, operands[1])"
+   "@
+-  cmn\\t%<w>0, %<w>1
++  cmn\\t%<w>0, %1
+   cmp\\t%<w>0, #%n1"
+   [(set_attr "type" "alus_imm")]
+ )
+@@ -1815,11 +1815,11 @@
+   "aarch64_zero_extend_const_eq (<DWI>mode, operands[3],
+                                  <MODE>mode, operands[2])"
+   "@
+-  adds\\t%<w>0, %<w>1, %<w>2
++  adds\\t%<w>0, %<w>1, %2
+   subs\\t%<w>0, %<w>1, #%n2"
+   [(set_attr "type" "alus_imm")]
+ )
+- 
++
+ (define_insn "add<mode>3_compareC"
+   [(set (reg:CC_C CC_REGNUM)
+ 	(ne:CC_C
+@@ -3422,7 +3422,9 @@
+          (LOGICAL:SI (match_operand:SI 1 "register_operand" "%r,r")
+ 		     (match_operand:SI 2 "aarch64_logical_operand" "r,K"))))]
+   ""
+-  "<logical>\\t%w0, %w1, %w2"
++  "@
++   <logical>\\t%w0, %w1, %w2
++   <logical>\\t%w0, %w1, %2"
+   [(set_attr "type" "logic_reg,logic_imm")]
+ )
+ 
+@@ -3435,7 +3437,9 @@
+    (set (match_operand:GPI 0 "register_operand" "=r,r")
+ 	(and:GPI (match_dup 1) (match_dup 2)))]
+   ""
+-  "ands\\t%<w>0, %<w>1, %<w>2"
++  "@
++   ands\\t%<w>0, %<w>1, %<w>2
++   ands\\t%<w>0, %<w>1, %2"
+   [(set_attr "type" "logics_reg,logics_imm")]
+ )
+ 
+@@ -3449,7 +3453,9 @@
+    (set (match_operand:DI 0 "register_operand" "=r,r")
+ 	(zero_extend:DI (and:SI (match_dup 1) (match_dup 2))))]
+   ""
+-  "ands\\t%w0, %w1, %w2"
++  "@
++   ands\\t%w0, %w1, %w2
++   ands\\t%w0, %w1, %2"
+   [(set_attr "type" "logics_reg,logics_imm")]
+ )
+ 
+@@ -3803,7 +3809,9 @@
+ 		  (match_operand:GPI 1 "aarch64_logical_operand" "r,<lconst>"))
+ 	 (const_int 0)))]
+   ""
+-  "tst\\t%<w>0, %<w>1"
++  "@
++   tst\\t%<w>0, %<w>1
++   tst\\t%<w>0, %1"
+   [(set_attr "type" "logics_reg,logics_imm")]
+ )
+ 
+@@ -3869,22 +3877,16 @@
+ (define_expand "ashl<mode>3"
+   [(set (match_operand:SHORT 0 "register_operand")
+ 	(ashift:SHORT (match_operand:SHORT 1 "register_operand")
+-		      (match_operand:QI 2 "nonmemory_operand")))]
++		      (match_operand:QI 2 "const_int_operand")))]
+   ""
+   {
+-    if (CONST_INT_P (operands[2]))
+-      {
+-        operands[2] = GEN_INT (INTVAL (operands[2])
+-                               & (GET_MODE_BITSIZE (<MODE>mode) - 1));
++    operands[2] = GEN_INT (INTVAL (operands[2]) & GET_MODE_MASK (<MODE>mode));
+ 
+-        if (operands[2] == const0_rtx)
+-          {
+-	    emit_insn (gen_mov<mode> (operands[0], operands[1]));
+-	    DONE;
+-          }
++    if (operands[2] == const0_rtx)
++      {
++	emit_insn (gen_mov<mode> (operands[0], operands[1]));
++	DONE;
+       }
+-    else
+-      FAIL;
+   }
+ )
+ 
+@@ -3933,33 +3935,35 @@
+ 
+ ;; Logical left shift using SISD or Integer instruction
+ (define_insn "*aarch64_ashl_sisd_or_int_<mode>3"
+-  [(set (match_operand:GPI 0 "register_operand" "=r,w,w")
+-        (ashift:GPI
+-          (match_operand:GPI 1 "register_operand" "r,w,w")
+-          (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "rUs<cmode>,Us<cmode>,w")))]
++  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,w")
++	(ashift:GPI
++	  (match_operand:GPI 1 "register_operand" "r,r,w,w")
++	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "Us<cmode>,r,Us<cmode>,w")))]
+   ""
+   "@
++   lsl\t%<w>0, %<w>1, %2
+    lsl\t%<w>0, %<w>1, %<w>2
+    shl\t%<rtn>0<vas>, %<rtn>1<vas>, %2
+    ushl\t%<rtn>0<vas>, %<rtn>1<vas>, %<rtn>2<vas>"
+-  [(set_attr "simd" "no,yes,yes")
+-   (set_attr "type" "shift_reg,neon_shift_imm<q>, neon_shift_reg<q>")]
++  [(set_attr "simd" "no,no,yes,yes")
++   (set_attr "type" "bfm,shift_reg,neon_shift_imm<q>, neon_shift_reg<q>")]
+ )
+ 
+ ;; Logical right shift using SISD or Integer instruction
+ (define_insn "*aarch64_lshr_sisd_or_int_<mode>3"
+-  [(set (match_operand:GPI 0 "register_operand" "=r,w,&w,&w")
+-        (lshiftrt:GPI
+-          (match_operand:GPI 1 "register_operand" "r,w,w,w")
+-          (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "rUs<cmode>,Us<cmode>,w,0")))]
++  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,&w,&w")
++	(lshiftrt:GPI
++	 (match_operand:GPI 1 "register_operand" "r,r,w,w,w")
++	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "Us<cmode>,r,Us<cmode>,w,0")))]
+   ""
+   "@
++   lsr\t%<w>0, %<w>1, %2
+    lsr\t%<w>0, %<w>1, %<w>2
+    ushr\t%<rtn>0<vas>, %<rtn>1<vas>, %2
+    #
+    #"
+-  [(set_attr "simd" "no,yes,yes,yes")
+-   (set_attr "type" "shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")]
++  [(set_attr "simd" "no,no,yes,yes,yes")
++   (set_attr "type" "bfm,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")]
+ )
+ 
+ (define_split
+@@ -3994,18 +3998,19 @@
+ 
+ ;; Arithmetic right shift using SISD or Integer instruction
+ (define_insn "*aarch64_ashr_sisd_or_int_<mode>3"
+-  [(set (match_operand:GPI 0 "register_operand" "=r,w,&w,&w")
++  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,&w,&w")
+         (ashiftrt:GPI
+-          (match_operand:GPI 1 "register_operand" "r,w,w,w")
+-          (match_operand:QI 2 "aarch64_reg_or_shift_imm_di" "rUs<cmode>,Us<cmode>,w,0")))]
++          (match_operand:GPI 1 "register_operand" "r,r,w,w,w")
++          (match_operand:QI 2 "aarch64_reg_or_shift_imm_di" "Us<cmode>,r,Us<cmode>,w,0")))]
+   ""
+   "@
++   asr\t%<w>0, %<w>1, %2
+    asr\t%<w>0, %<w>1, %<w>2
+    sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2
+    #
+    #"
+-  [(set_attr "simd" "no,yes,yes,yes")
+-   (set_attr "type" "shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")]
++  [(set_attr "simd" "no,no,yes,yes,yes")
++   (set_attr "type" "bfm,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")]
+ )
+ 
+ (define_split
+@@ -4097,21 +4102,25 @@
+   [(set (match_operand:GPI 0 "register_operand" "=r,r")
+      (rotatert:GPI
+        (match_operand:GPI 1 "register_operand" "r,r")
+-       (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "r,Us<cmode>")))]
++       (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "Us<cmode>,r")))]
+   ""
+-  "ror\\t%<w>0, %<w>1, %<w>2"
+-  [(set_attr "type" "shift_reg, rotate_imm")]
++  "@
++   ror\\t%<w>0, %<w>1, %2
++   ror\\t%<w>0, %<w>1, %<w>2"
++  [(set_attr "type" "rotate_imm,shift_reg")]
+ )
+ 
+ ;; zero_extend version of above
+ (define_insn "*<optab>si3_insn_uxtw"
+-  [(set (match_operand:DI 0 "register_operand" "=r")
++  [(set (match_operand:DI 0 "register_operand" "=r,r")
+ 	(zero_extend:DI (SHIFT:SI
+-	 (match_operand:SI 1 "register_operand" "r")
+-	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "rUss"))))]
++	 (match_operand:SI 1 "register_operand" "r,r")
++	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))]
+   ""
+-  "<shift>\\t%w0, %w1, %w2"
+-  [(set_attr "type" "shift_reg")]
++  "@
++   <shift>\\t%w0, %w1, %2
++   <shift>\\t%w0, %w1, %w2"
++  [(set_attr "type" "bfm,shift_reg")]
+ )
+ 
+ (define_insn "*<optab><mode>3_insn"
+@@ -4135,7 +4144,7 @@
+   "UINTVAL (operands[3]) < GET_MODE_BITSIZE (<MODE>mode) &&
+    (UINTVAL (operands[3]) + UINTVAL (operands[4]) == GET_MODE_BITSIZE (<MODE>mode))"
+   "extr\\t%<w>0, %<w>1, %<w>2, %4"
+-  [(set_attr "type" "shift_imm")]
++  [(set_attr "type" "rotate_imm")]
+ )
+ 
+ ;; There are no canonicalisation rules for ashift and lshiftrt inside an ior
+@@ -4150,7 +4159,7 @@
+    && (UINTVAL (operands[3]) + UINTVAL (operands[4])
+        == GET_MODE_BITSIZE (<MODE>mode))"
+   "extr\\t%<w>0, %<w>1, %<w>2, %4"
+-  [(set_attr "type" "shift_imm")]
++  [(set_attr "type" "rotate_imm")]
+ )
+ 
+ ;; zero_extend version of the above
+@@ -4164,7 +4173,7 @@
+   "UINTVAL (operands[3]) < 32 &&
+    (UINTVAL (operands[3]) + UINTVAL (operands[4]) == 32)"
+   "extr\\t%w0, %w1, %w2, %4"
+-  [(set_attr "type" "shift_imm")]
++  [(set_attr "type" "rotate_imm")]
+ )
+ 
+ (define_insn "*extrsi5_insn_uxtw_alt"
+@@ -4177,7 +4186,7 @@
+   "UINTVAL (operands[3]) < 32 &&
+    (UINTVAL (operands[3]) + UINTVAL (operands[4]) == 32)"
+   "extr\\t%w0, %w1, %w2, %4"
+-  [(set_attr "type" "shift_imm")]
++  [(set_attr "type" "rotate_imm")]
+ )
+ 
+ (define_insn "*ror<mode>3_insn"
+@@ -5191,7 +5200,7 @@
+ 	 UNSPEC_SP_TEST))
+    (clobber (match_scratch:PTR 3 "=&r"))]
+   ""
+-  "ldr\t%<w>3, %x1\;ldr\t%<w>0, %x2\;eor\t%<w>0, %<w>3, %<w>0"
++  "ldr\t%<w>3, %1\;ldr\t%<w>0, %2\;eor\t%<w>0, %<w>3, %<w>0"
+   [(set_attr "length" "12")
+    (set_attr "type" "multiple")])
+ 
+--- a/src/gcc/config/aarch64/arm_neon.h
++++ b/src/gcc/config/aarch64/arm_neon.h
+@@ -7938,61 +7938,6 @@ vmovn_u64 (uint64x2_t a)
+   return result;
+ }
+ 
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vmul_n_f32 (float32x2_t a, float32_t b)
+-{
+-  float32x2_t result;
+-  __asm__ ("fmul %0.2s,%1.2s,%2.s[0]"
+-           : "=w"(result)
+-           : "w"(a), "w"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vmul_n_s16 (int16x4_t a, int16_t b)
+-{
+-  int16x4_t result;
+-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
+-           : "=w"(result)
+-           : "w"(a), "x"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vmul_n_s32 (int32x2_t a, int32_t b)
+-{
+-  int32x2_t result;
+-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
+-           : "=w"(result)
+-           : "w"(a), "w"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vmul_n_u16 (uint16x4_t a, uint16_t b)
+-{
+-  uint16x4_t result;
+-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
+-           : "=w"(result)
+-           : "w"(a), "x"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vmul_n_u32 (uint32x2_t a, uint32_t b)
+-{
+-  uint32x2_t result;
+-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
+-           : "=w"(result)
+-           : "w"(a), "w"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+ #define vmull_high_lane_s16(a, b, c)                                    \
+   __extension__                                                         \
+     ({                                                                  \
+@@ -8443,227 +8388,6 @@ vmull_u32 (uint32x2_t a, uint32x2_t b)
+   return result;
+ }
+ 
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vmulq_n_f32 (float32x4_t a, float32_t b)
+-{
+-  float32x4_t result;
+-  __asm__ ("fmul %0.4s,%1.4s,%2.s[0]"
+-           : "=w"(result)
+-           : "w"(a), "w"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vmulq_n_f64 (float64x2_t a, float64_t b)
+-{
+-  float64x2_t result;
+-  __asm__ ("fmul %0.2d,%1.2d,%2.d[0]"
+-           : "=w"(result)
+-           : "w"(a), "w"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vmulq_n_s16 (int16x8_t a, int16_t b)
+-{
+-  int16x8_t result;
+-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
+-           : "=w"(result)
+-           : "w"(a), "x"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vmulq_n_s32 (int32x4_t a, int32_t b)
+-{
+-  int32x4_t result;
+-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
+-           : "=w"(result)
+-           : "w"(a), "w"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vmulq_n_u16 (uint16x8_t a, uint16_t b)
+-{
+-  uint16x8_t result;
+-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
+-           : "=w"(result)
+-           : "w"(a), "x"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vmulq_n_u32 (uint32x4_t a, uint32_t b)
+-{
+-  uint32x4_t result;
+-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
+-           : "=w"(result)
+-           : "w"(a), "w"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+-vmvn_p8 (poly8x8_t a)
+-{
+-  poly8x8_t result;
+-  __asm__ ("mvn %0.8b,%1.8b"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vmvn_s8 (int8x8_t a)
+-{
+-  int8x8_t result;
+-  __asm__ ("mvn %0.8b,%1.8b"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vmvn_s16 (int16x4_t a)
+-{
+-  int16x4_t result;
+-  __asm__ ("mvn %0.8b,%1.8b"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vmvn_s32 (int32x2_t a)
+-{
+-  int32x2_t result;
+-  __asm__ ("mvn %0.8b,%1.8b"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vmvn_u8 (uint8x8_t a)
+-{
+-  uint8x8_t result;
+-  __asm__ ("mvn %0.8b,%1.8b"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vmvn_u16 (uint16x4_t a)
+-{
+-  uint16x4_t result;
+-  __asm__ ("mvn %0.8b,%1.8b"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vmvn_u32 (uint32x2_t a)
+-{
+-  uint32x2_t result;
+-  __asm__ ("mvn %0.8b,%1.8b"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+-vmvnq_p8 (poly8x16_t a)
+-{
+-  poly8x16_t result;
+-  __asm__ ("mvn %0.16b,%1.16b"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+-vmvnq_s8 (int8x16_t a)
+-{
+-  int8x16_t result;
+-  __asm__ ("mvn %0.16b,%1.16b"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vmvnq_s16 (int16x8_t a)
+-{
+-  int16x8_t result;
+-  __asm__ ("mvn %0.16b,%1.16b"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vmvnq_s32 (int32x4_t a)
+-{
+-  int32x4_t result;
+-  __asm__ ("mvn %0.16b,%1.16b"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vmvnq_u8 (uint8x16_t a)
+-{
+-  uint8x16_t result;
+-  __asm__ ("mvn %0.16b,%1.16b"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vmvnq_u16 (uint16x8_t a)
+-{
+-  uint16x8_t result;
+-  __asm__ ("mvn %0.16b,%1.16b"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vmvnq_u32 (uint32x4_t a)
+-{
+-  uint32x4_t result;
+-  __asm__ ("mvn %0.16b,%1.16b"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-
+ __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+ vpadal_s8 (int16x4_t a, int8x8_t b)
+ {
+@@ -14456,6 +14180,12 @@ vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
+   return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a);
+ }
+ 
++__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
++vfma_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c)
++{
++  return (float64x1_t) {__b[0] * __c + __a[0]};
++}
++
+ __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+ vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
+ {
+@@ -14597,6 +14327,29 @@ vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
+   return __builtin_aarch64_fmav2df (-__b, __c, __a);
+ }
+ 
++__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++vfms_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
++{
++  return __builtin_aarch64_fmav2sf (-__b, vdup_n_f32 (__c), __a);
++}
++
++__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
++vfms_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c)
++{
++  return (float64x1_t) {-__b[0] * __c + __a[0]};
++}
++
++__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++vfmsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
++{
++  return __builtin_aarch64_fmav4sf (-__b, vdupq_n_f32 (__c), __a);
++}
++
++__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
++vfmsq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c)
++{
++  return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c), __a);
++}
+ 
+ /* vfms_lane  */
+ 
+@@ -18895,6 +18648,160 @@ vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __lane)
+   return __a * __aarch64_vget_lane_any (__b, __lane);
+ }
+ 
++/* vmul_n.  */
++
++__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++vmul_n_f32 (float32x2_t __a, float32_t __b)
++{
++  return __a * __b;
++}
++
++__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++vmulq_n_f32 (float32x4_t __a, float32_t __b)
++{
++  return __a * __b;
++}
++
++__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
++vmulq_n_f64 (float64x2_t __a, float64_t __b)
++{
++  return __a * __b;
++}
++
++__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++vmul_n_s16 (int16x4_t __a, int16_t __b)
++{
++  return __a * __b;
++}
++
++__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++vmulq_n_s16 (int16x8_t __a, int16_t __b)
++{
++  return __a * __b;
++}
++
++__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++vmul_n_s32 (int32x2_t __a, int32_t __b)
++{
++  return __a * __b;
++}
++
++__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++vmulq_n_s32 (int32x4_t __a, int32_t __b)
++{
++  return __a * __b;
++}
++
++__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++vmul_n_u16 (uint16x4_t __a, uint16_t __b)
++{
++  return __a * __b;
++}
++
++__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
++{
++  return __a * __b;
++}
++
++__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++vmul_n_u32 (uint32x2_t __a, uint32_t __b)
++{
++  return __a * __b;
++}
++
++__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
++{
++  return __a * __b;
++}
++
++/* vmvn  */
++
++__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++vmvn_p8 (poly8x8_t __a)
++{
++  return (poly8x8_t) ~((int8x8_t) __a);
++}
++
++__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++vmvn_s8 (int8x8_t __a)
++{
++  return ~__a;
++}
++
++__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++vmvn_s16 (int16x4_t __a)
++{
++  return ~__a;
++}
++
++__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++vmvn_s32 (int32x2_t __a)
++{
++  return ~__a;
++}
++
++__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++vmvn_u8 (uint8x8_t __a)
++{
++  return ~__a;
++}
++
++__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++vmvn_u16 (uint16x4_t __a)
++{
++  return ~__a;
++}
++
++__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++vmvn_u32 (uint32x2_t __a)
++{
++  return ~__a;
++}
++
++__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++vmvnq_p8 (poly8x16_t __a)
++{
++  return (poly8x16_t) ~((int8x16_t) __a);
++}
++
++__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++vmvnq_s8 (int8x16_t __a)
++{
++  return ~__a;
++}
++
++__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++vmvnq_s16 (int16x8_t __a)
++{
++  return ~__a;
++}
++
++__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++vmvnq_s32 (int32x4_t __a)
++{
++  return ~__a;
++}
++
++__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++vmvnq_u8 (uint8x16_t __a)
++{
++  return ~__a;
++}
++
++__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++vmvnq_u16 (uint16x8_t __a)
++{
++  return ~__a;
++}
++
++__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++vmvnq_u32 (uint32x4_t __a)
++{
++  return ~__a;
++}
++
+ /* vneg  */
+ 
+ __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
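(Illustrative only, not part of the patch: the arm_neon.h hunk above implements the new vfma_n/vfms_n intrinsics via the fused multiply-add builtins and vmul_n/vmvn via plain GCC vector operators. A minimal usage sketch, assuming an AArch64 compiler providing <arm_neon.h>:)

    /* Sketch of the semantics of the intrinsics added above; values in the
       comments are what the lanes should hold.  Not part of the patch.  */
    #include <arm_neon.h>
    #include <stdio.h>

    int main (void)
    {
      float32x2_t a = {1.0f, 2.0f};
      float32x2_t b = {3.0f, 4.0f};

      /* vfma_n_f32: a + b * c, with the scalar c broadcast to both lanes.  */
      float32x2_t fma = vfma_n_f32 (a, b, 2.0f);   /* {7, 10} */
      /* vfms_n_f32: a - b * c.  */
      float32x2_t fms = vfms_n_f32 (a, b, 2.0f);   /* {-5, -6} */
      /* vmul_n_f32: lane-wise multiply by a broadcast scalar.  */
      float32x2_t mul = vmul_n_f32 (a, 2.0f);      /* {2, 4} */
      /* vmvn_u8: bitwise NOT of every lane.  */
      uint8x8_t inv = vmvn_u8 (vdup_n_u8 (0x0f));  /* each lane 0xf0 */

      printf ("%f %f %f %x\n",
              (double) vget_lane_f32 (fma, 0),
              (double) vget_lane_f32 (fms, 0),
              (double) vget_lane_f32 (mul, 0),
              (unsigned) vget_lane_u8 (inv, 0));
      return 0;
    }
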
+--- a/src/gcc/config/aarch64/iterators.md
++++ b/src/gcc/config/aarch64/iterators.md
+@@ -715,6 +715,7 @@
+ (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
+ (define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
+ 
++;; Sum of lengths of instructions needed to move vector registers of a mode.
+ (define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")])
+ 
+ ;; -fpic small model GOT reloc modifers: gotpage_lo15/lo14 for ILP64/32.
+--- a/src/gcc/config/arm/arm-protos.h
++++ b/src/gcc/config/arm/arm-protos.h
+@@ -319,6 +319,7 @@ extern int vfp3_const_double_for_bits (rtx);
+ 
+ extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
+ 					   rtx);
++extern bool arm_fusion_enabled_p (tune_params::fuse_ops);
+ extern bool arm_valid_symbolic_address_p (rtx);
+ extern bool arm_validize_comparison (rtx *, rtx *, rtx *);
+ #endif /* RTX_CODE */
+@@ -601,6 +602,9 @@ extern int arm_tune_cortex_a9;
+    interworking clean.  */
+ extern int arm_cpp_interwork;
+ 
++/* Nonzero if chip supports Thumb 1.  */
++extern int arm_arch_thumb1;
++
+ /* Nonzero if chip supports Thumb 2.  */
+ extern int arm_arch_thumb2;
+ 
+--- a/src/gcc/config/arm/arm.c
++++ b/src/gcc/config/arm/arm.c
+@@ -852,6 +852,9 @@ int arm_tune_cortex_a9 = 0;
+    interworking clean.  */
+ int arm_cpp_interwork = 0;
+ 
++/* Nonzero if chip supports Thumb 1.  */
++int arm_arch_thumb1;
++
+ /* Nonzero if chip supports Thumb 2.  */
+ int arm_arch_thumb2;
+ 
+@@ -3170,6 +3173,7 @@ arm_option_override (void)
+   arm_arch7em = ARM_FSET_HAS_CPU1 (insn_flags, FL_ARCH7EM);
+   arm_arch8 = ARM_FSET_HAS_CPU1 (insn_flags, FL_ARCH8);
+   arm_arch8_1 = ARM_FSET_HAS_CPU2 (insn_flags, FL2_ARCH8_1);
++  arm_arch_thumb1 = ARM_FSET_HAS_CPU1 (insn_flags, FL_THUMB);
+   arm_arch_thumb2 = ARM_FSET_HAS_CPU1 (insn_flags, FL_THUMB2);
+   arm_arch_xscale = ARM_FSET_HAS_CPU1 (insn_flags, FL_XSCALE);
+ 
+@@ -10759,8 +10763,6 @@ arm_new_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
+       if ((arm_arch4 || GET_MODE (XEXP (x, 0)) == SImode)
+ 	  && MEM_P (XEXP (x, 0)))
+ 	{
+-	  *cost = rtx_cost (XEXP (x, 0), VOIDmode, code, 0, speed_p);
+-
+ 	  if (mode == DImode)
+ 	    *cost += COSTS_N_INSNS (1);
+ 
+@@ -15981,14 +15983,17 @@ gen_operands_ldrd_strd (rtx *operands, bool load,
+   /* If the same input register is used in both stores
+      when storing different constants, try to find a free register.
+      For example, the code
+-        mov r0, 0
+-        str r0, [r2]
+-        mov r0, 1
+-        str r0, [r2, #4]
++	mov r0, 0
++	str r0, [r2]
++	mov r0, 1
++	str r0, [r2, #4]
+      can be transformed into
+-        mov r1, 0
+-        strd r1, r0, [r2]
+-     in Thumb mode assuming that r1 is free.  */
++	mov r1, 0
++	mov r0, 1
++	strd r1, r0, [r2]
++     in Thumb mode assuming that r1 is free.
++     For ARM mode do the same but only if the starting register
++     can be made to be even.  */
+   if (const_store
+       && REGNO (operands[0]) == REGNO (operands[1])
+       && INTVAL (operands[4]) != INTVAL (operands[5]))
+@@ -16007,7 +16012,6 @@ gen_operands_ldrd_strd (rtx *operands, bool load,
+       }
+     else if (TARGET_ARM)
+       {
+-        return false;
+         int regno = REGNO (operands[0]);
+         if (!peep2_reg_dead_p (4, operands[0]))
+           {
+@@ -29801,6 +29805,13 @@ aarch_macro_fusion_pair_p (rtx_insn* prev, rtx_insn* curr)
+   return false;
+ }
+ 
++/* Return true iff the instruction fusion described by OP is enabled.  */
++bool
++arm_fusion_enabled_p (tune_params::fuse_ops op)
++{
++  return current_tune->fusible_ops & op;
++}
++
+ /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
+ 
+ static unsigned HOST_WIDE_INT
+--- a/src/gcc/config/arm/arm.h
++++ b/src/gcc/config/arm/arm.h
+@@ -478,6 +478,9 @@ extern int arm_tune_cortex_a9;
+    interworking clean.  */
+ extern int arm_cpp_interwork;
+ 
++/* Nonzero if chip supports Thumb 1.  */
++extern int arm_arch_thumb1;
++
+ /* Nonzero if chip supports Thumb 2.  */
+ extern int arm_arch_thumb2;
+ 
+@@ -2187,13 +2190,9 @@ extern int making_const_table;
+ #define TARGET_ARM_ARCH	\
+   (arm_base_arch)	\
+ 
+-#define TARGET_ARM_V6M (!arm_arch_notm && !arm_arch_thumb2)
+-#define TARGET_ARM_V7M (!arm_arch_notm && arm_arch_thumb2)
+-
+ /* The highest Thumb instruction set version supported by the chip.  */
+-#define TARGET_ARM_ARCH_ISA_THUMB 		\
+-  (arm_arch_thumb2 ? 2				\
+-	           : ((TARGET_ARM_ARCH >= 5 || arm_arch4t) ? 1 : 0))
++#define TARGET_ARM_ARCH_ISA_THUMB		\
++  (arm_arch_thumb2 ? 2 : (arm_arch_thumb1 ? 1 : 0))
+ 
+ /* Expands to an upper-case char of the target's architectural
+    profile.  */
+--- a/src/gcc/config/arm/arm.md
++++ b/src/gcc/config/arm/arm.md
+@@ -8152,8 +8152,8 @@
+ )
+ 
+ (define_insn "probe_stack"
+-  [(set (match_operand 0 "memory_operand" "=m")
+-        (unspec [(const_int 0)] UNSPEC_PROBE_STACK))]
++  [(set (match_operand:SI 0 "memory_operand" "=m")
++        (unspec:SI [(const_int 0)] UNSPEC_PROBE_STACK))]
+   "TARGET_32BIT"
+   "str%?\\tr0, %0"
+   [(set_attr "type" "store1")
+--- a/src/gcc/config/arm/arm_neon.h
++++ b/src/gcc/config/arm/arm_neon.h
+@@ -2607,6 +2607,12 @@ vtst_p8 (poly8x8_t __a, poly8x8_t __b)
+   return (uint8x8_t)__builtin_neon_vtstv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+ 
++__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++vtst_p16 (poly16x4_t __a, poly16x4_t __b)
++{
++  return (uint16x4_t)__builtin_neon_vtstv4hi ((int16x4_t) __a, (int16x4_t) __b);
++}
++
+ __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+ vtstq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+@@ -2649,6 +2655,12 @@ vtstq_p8 (poly8x16_t __a, poly8x16_t __b)
+   return (uint8x16_t)__builtin_neon_vtstv16qi ((int8x16_t) __a, (int8x16_t) __b);
+ }
+ 
++__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++vtstq_p16 (poly16x8_t __a, poly16x8_t __b)
++{
++  return (uint16x8_t)__builtin_neon_vtstv8hi ((int16x8_t) __a, (int16x8_t) __b);
++}
++
+ __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+ vabd_s8 (int8x8_t __a, int8x8_t __b)
+ {
+--- a/src/gcc/config/arm/crypto.md
++++ b/src/gcc/config/arm/crypto.md
+@@ -18,14 +18,27 @@
+ ;; along with GCC; see the file COPYING3.  If not see
+ ;; <http://www.gnu.org/licenses/>.
+ 
++
++;; When AES/AESMC fusion is enabled we want the register allocation to
++;; look like:
++;;    AESE Vn, _
++;;    AESMC Vn, Vn
++;; So prefer to tie operand 1 to operand 0 when fusing.
++
+ (define_insn "crypto_<crypto_pattern>"
+-  [(set (match_operand:<crypto_mode> 0 "register_operand" "=w")
++  [(set (match_operand:<crypto_mode> 0 "register_operand" "=w,w")
+         (unspec:<crypto_mode> [(match_operand:<crypto_mode> 1
+-                       "register_operand" "w")]
++                       "register_operand" "0,w")]
+          CRYPTO_UNARY))]
+   "TARGET_CRYPTO"
+   "<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q1"
+-  [(set_attr "type" "<crypto_type>")]
++  [(set_attr "type" "<crypto_type>")
++   (set_attr_alternative "enabled"
++     [(if_then_else (match_test
++		       "arm_fusion_enabled_p (tune_params::FUSE_AES_AESMC)")
++		     (const_string "yes" )
++		     (const_string "no"))
++      (const_string "yes")])]
+ )
+ 
+ (define_insn "crypto_<crypto_pattern>"
+--- a/src/gcc/config/arm/sync.md
++++ b/src/gcc/config/arm/sync.md
+@@ -452,14 +452,13 @@
+   {
+     if (<MODE>mode == DImode)
+       {
+-	rtx value = operands[2];
+ 	/* The restrictions on target registers in ARM mode are that the two
+ 	   registers are consecutive and the first one is even; Thumb is
+ 	   actually more flexible, but DI should give us this anyway.
+-	   Note that the 1st register always gets the lowest word in memory.  */
+-	gcc_assert ((REGNO (value) & 1) == 0 || TARGET_THUMB2);
+-	operands[3] = gen_rtx_REG (SImode, REGNO (value) + 1);
+-	return "strexd%?\t%0, %2, %3, %C1";
++	   Note that the 1st register always gets the
++	   lowest word in memory.  */
++	gcc_assert ((REGNO (operands[2]) & 1) == 0 || TARGET_THUMB2);
++	return "strexd%?\t%0, %2, %H2, %C1";
+       }
+     return "strex<sync_sfx>%?\t%0, %2, %C1";
+   }
+@@ -475,11 +474,9 @@
+ 	  VUNSPEC_SLX))]
+   "TARGET_HAVE_LDACQ && ARM_DOUBLEWORD_ALIGN"
+   {
+-    rtx value = operands[2];
+     /* See comment in arm_store_exclusive<mode> above.  */
+-    gcc_assert ((REGNO (value) & 1) == 0 || TARGET_THUMB2);
+-    operands[3] = gen_rtx_REG (SImode, REGNO (value) + 1);
+-    return "stlexd%?\t%0, %2, %3, %C1";
++    gcc_assert ((REGNO (operands[2]) & 1) == 0 || TARGET_THUMB2);
++    return "stlexd%?\t%0, %2, %H2, %C1";
+   }
+   [(set_attr "predicable" "yes")
+    (set_attr "predicable_short_it" "no")])
+--- a/src/gcc/config/arm/thumb1.md
++++ b/src/gcc/config/arm/thumb1.md
+@@ -142,11 +142,11 @@
+    (set_attr "type" "alus_sreg")]
+ )
+ 
+-; Unfortunately with the Thumb the '&'/'0' trick can fails when operands
+-; 1 and 2; are the same, because reload will make operand 0 match
+-; operand 1 without realizing that this conflicts with operand 2.  We fix
+-; this by adding another alternative to match this case, and then `reload'
+-; it ourselves.  This alternative must come first.
++;; Unfortunately on Thumb the '&'/'0' trick can fail when operands
++;; 1 and 2 are the same, because reload will make operand 0 match
++;; operand 1 without realizing that this conflicts with operand 2.  We fix
++;; this by adding another alternative to match this case, and then `reload'
++;; it ourselves.  This alternative must come first.
+ (define_insn "*thumb_mulsi3"
+   [(set (match_operand:SI          0 "register_operand" "=&l,&l,&l")
+ 	(mult:SI (match_operand:SI 1 "register_operand" "%l,*h,0")
+--- a/src/gcc/configure
++++ b/src/gcc/configure
+@@ -1711,7 +1711,8 @@ Optional Packages:
+   --with-stabs            arrange to use stabs instead of host debug format
+   --with-dwarf2           force the default debug format to be DWARF 2
+   --with-specs=SPECS      add SPECS to driver command-line processing
+-  --with-pkgversion=PKG   Use PKG in the version string in place of "GCC"
++  --with-pkgversion=PKG   Use PKG in the version string in place of "Linaro
++                          GCC `cat $srcdir/LINARO-VERSION`"
+   --with-bugurl=URL       Direct users to URL to report a bug
+   --with-multilib-list    select multilibs (AArch64, SH and x86-64 only)
+   --with-gnu-ld           assume the C compiler uses GNU ld default=no
+@@ -7651,7 +7652,7 @@ if test "${with_pkgversion+set}" = set; then :
+       *)   PKGVERSION="($withval) " ;;
+      esac
+ else
+-  PKGVERSION="(GCC) "
++  PKGVERSION="(Linaro GCC `cat $srcdir/LINARO-VERSION`) "
+ 
+ fi
+ 
+@@ -18453,7 +18454,7 @@ else
+   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
+   lt_status=$lt_dlunknown
+   cat > conftest.$ac_ext <<_LT_EOF
+-#line 18456 "configure"
++#line 18457 "configure"
+ #include "confdefs.h"
+ 
+ #if HAVE_DLFCN_H
+@@ -18559,7 +18560,7 @@ else
+   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
+   lt_status=$lt_dlunknown
+   cat > conftest.$ac_ext <<_LT_EOF
+-#line 18562 "configure"
++#line 18563 "configure"
+ #include "confdefs.h"
+ 
+ #if HAVE_DLFCN_H
+--- a/src/gcc/configure.ac
++++ b/src/gcc/configure.ac
+@@ -903,7 +903,7 @@ AC_ARG_WITH(specs,
+ )
+ AC_SUBST(CONFIGURE_SPECS)
+ 
+-ACX_PKGVERSION([GCC])
++ACX_PKGVERSION([Linaro GCC `cat $srcdir/LINARO-VERSION`])
+ ACX_BUGURL([http://gcc.gnu.org/bugs.html])
+ 
+ # Sanity check enable_languages in case someone does not run the toplevel
+--- a/src/gcc/cppbuiltin.c
++++ b/src/gcc/cppbuiltin.c
+@@ -52,18 +52,41 @@ parse_basever (int *major, int *minor, int *patchlevel)
+     *patchlevel = s_patchlevel;
+ }
+ 
++/* Parse a LINAROVER version string of the format "M.m-year.month[-spin][~dev]"
++   to create Linaro release number YYYYMM and spin version.  */
++static void
++parse_linarover (int *release, int *spin)
++{
++  static int s_year = -1, s_month, s_spin;
++
++  if (s_year == -1)
++    if (sscanf (LINAROVER, "%*[^-]-%d.%d-%d", &s_year, &s_month, &s_spin) != 3)
++      {
++	sscanf (LINAROVER, "%*[^-]-%d.%d", &s_year, &s_month);
++	s_spin = 0;
++      }
++
++  if (release)
++    *release = s_year * 100 + s_month;
++
++  if (spin)
++    *spin = s_spin;
++}
+ 
+ /* Define __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__ and __VERSION__.  */
+ static void
+ define__GNUC__ (cpp_reader *pfile)
+ {
+-  int major, minor, patchlevel;
++  int major, minor, patchlevel, linaro_release, linaro_spin;
+ 
+   parse_basever (&major, &minor, &patchlevel);
++  parse_linarover (&linaro_release, &linaro_spin);
+   cpp_define_formatted (pfile, "__GNUC__=%d", major);
+   cpp_define_formatted (pfile, "__GNUC_MINOR__=%d", minor);
+   cpp_define_formatted (pfile, "__GNUC_PATCHLEVEL__=%d", patchlevel);
+   cpp_define_formatted (pfile, "__VERSION__=\"%s\"", version_string);
++  cpp_define_formatted (pfile, "__LINARO_RELEASE__=%d", linaro_release);
++  cpp_define_formatted (pfile, "__LINARO_SPIN__=%d", linaro_spin);
+   cpp_define_formatted (pfile, "__ATOMIC_RELAXED=%d", MEMMODEL_RELAXED);
+   cpp_define_formatted (pfile, "__ATOMIC_SEQ_CST=%d", MEMMODEL_SEQ_CST);
+   cpp_define_formatted (pfile, "__ATOMIC_ACQUIRE=%d", MEMMODEL_ACQUIRE);
+--- a/src/gcc/internal-fn.c
++++ b/src/gcc/internal-fn.c
+@@ -1807,11 +1807,7 @@ expand_arith_overflow (enum tree_code code, gimple *stmt)
+       /* For sub-word operations, retry with a wider type first.  */
+       if (orig_precres == precres && precop <= BITS_PER_WORD)
+ 	{
+-#if WORD_REGISTER_OPERATIONS
+-	  int p = BITS_PER_WORD;
+-#else
+-	  int p = precop;
+-#endif
++	  int p = WORD_REGISTER_OPERATIONS ? BITS_PER_WORD : precop;
+ 	  enum machine_mode m = smallest_mode_for_size (p, MODE_INT);
+ 	  tree optype = build_nonstandard_integer_type (GET_MODE_PRECISION (m),
+ 							uns0_p && uns1_p
+--- a/src/gcc/lto/lto-partition.c
++++ b/src/gcc/lto/lto-partition.c
+@@ -447,7 +447,7 @@ add_sorted_nodes (vec<symtab_node *> &next_nodes, ltrans_partition partition)
+    and in-partition calls was reached.  */
+ 
+ void
+-lto_balanced_map (int n_lto_partitions)
++lto_balanced_map (int n_lto_partitions, int max_partition_size)
+ {
+   int n_nodes = 0;
+   int n_varpool_nodes = 0, varpool_pos = 0, best_varpool_pos = 0;
+@@ -511,6 +511,9 @@ lto_balanced_map (int n_lto_partitions)
+   varpool_order.qsort (varpool_node_cmp);
+ 
+   /* Compute partition size and create the first partition.  */
++  if (PARAM_VALUE (MIN_PARTITION_SIZE) > max_partition_size)
++    fatal_error (input_location, "min partition size cannot be greater than max partition size");
++
+   partition_size = total_size / n_lto_partitions;
+   if (partition_size < PARAM_VALUE (MIN_PARTITION_SIZE))
+     partition_size = PARAM_VALUE (MIN_PARTITION_SIZE);
+@@ -719,7 +722,8 @@ lto_balanced_map (int n_lto_partitions)
+ 		 best_cost, best_internal, best_i);
+       /* Partition is too large, unwind into step when best cost was reached and
+ 	 start new partition.  */
+-      if (partition->insns > 2 * partition_size)
++      if (partition->insns > 2 * partition_size
++	  || partition->insns > max_partition_size)
+ 	{
+ 	  if (best_i != i)
+ 	    {
+--- a/src/gcc/lto/lto-partition.h
++++ b/src/gcc/lto/lto-partition.h
+@@ -35,7 +35,7 @@ extern vec<ltrans_partition> ltrans_partitions;
+ 
+ void lto_1_to_1_map (void);
+ void lto_max_map (void);
+-void lto_balanced_map (int);
++void lto_balanced_map (int, int);
+ void lto_promote_cross_file_statics (void);
+ void free_ltrans_partitions (void);
+ void lto_promote_statics_nonwpa (void);
+--- a/src/gcc/lto/lto.c
++++ b/src/gcc/lto/lto.c
+@@ -3117,9 +3117,10 @@ do_whole_program_analysis (void)
+   else if (flag_lto_partition == LTO_PARTITION_MAX)
+     lto_max_map ();
+   else if (flag_lto_partition == LTO_PARTITION_ONE)
+-    lto_balanced_map (1);
++    lto_balanced_map (1, INT_MAX);
+   else if (flag_lto_partition == LTO_PARTITION_BALANCED)
+-    lto_balanced_map (PARAM_VALUE (PARAM_LTO_PARTITIONS));
++    lto_balanced_map (PARAM_VALUE (PARAM_LTO_PARTITIONS),
++		      PARAM_VALUE (MAX_PARTITION_SIZE));
+   else
+     gcc_unreachable ();
+ 
+--- a/src/gcc/params.def
++++ b/src/gcc/params.def
+@@ -1027,7 +1027,12 @@ DEFPARAM (PARAM_LTO_PARTITIONS,
+ DEFPARAM (MIN_PARTITION_SIZE,
+ 	  "lto-min-partition",
+ 	  "Minimal size of a partition for LTO (in estimated instructions).",
+-	  1000, 0, 0)
++	  10000, 0, 0)
++
++DEFPARAM (MAX_PARTITION_SIZE,
++	  "lto-max-partition",
++	  "Maximal size of a partition for LTO (in estimated instructions).",
++	  1000000, 0, INT_MAX)
+ 
+ /* Diagnostic parameters.  */
+ 
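(Usage note, not from the patch: with the defaults above, balanced WHOPR partitioning caps each LTO partition at 1,000,000 estimated instructions, and lto_balanced_map() now reports a fatal error if lto-min-partition exceeds lto-max-partition, as added in the lto-partition.c hunk. The cap can be tuned with the usual --param syntax, e.g.

    gcc -flto -flto-partition=balanced --param lto-max-partition=100000 foo.c bar.c
)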
+--- a/src/gcc/testsuite/g++.dg/lto/pr69589_0.C
++++ b/src/gcc/testsuite/g++.dg/lto/pr69589_0.C
+@@ -1,6 +1,8 @@
+ // { dg-lto-do link }
+-// { dg-lto-options "-O2 -rdynamic" } 
++// { dg-lto-options "-O2 -rdynamic" }
+ // { dg-extra-ld-options "-r -nostdlib" }
++// { dg-skip-if "Skip targets without -rdynamic support" { arm*-none-eabi aarch64*-*-elf } { "*" } { "" } }
++
+ #pragma GCC visibility push(hidden)
+ struct A { int &operator[] (long); };
+ template <typename> struct B;
+--- a/src/gcc/testsuite/gcc.dg/plugin/plugin.exp
++++ b/src/gcc/testsuite/gcc.dg/plugin/plugin.exp
+@@ -87,6 +87,12 @@ foreach plugin_test $plugin_test_list {
+     if ![runtest_file_p $runtests $plugin_src] then {
+         continue
+     }
++    # Skip tail call tests on targets that do not have sibcall_epilogue.
++    if {[regexp ".*must_tail_call_plugin.c" $plugin_src]
++	&& [istarget arm*-*-*]
++	&& [check_effective_target_arm_thumb1]} then {
++	continue
++    }
+     set plugin_input_tests [lreplace $plugin_test 0 0]
+     plugin-test-execute $plugin_src $plugin_input_tests
+ }
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.dg/tree-ssa/scev-11.c
+@@ -0,0 +1,28 @@
++/* { dg-do compile } */
++/* { dg-options "-O2 -fdump-tree-ivopts-details" } */
++
++int a[128];
++extern int b[];
++
++int bar (int *);
++
++int
++foo (int n)
++{
++  int i;
++
++  for (i = 0; i < n; i++)
++    {
++      unsigned char uc = (unsigned char)i;
++      a[i] = i;
++      b[uc] = 0;
++    }
++
++  bar (a);
++  return 0;
++}
++
++/* Address of array reference to b is scev.  */
++/* { dg-final { scan-tree-dump-times "use \[0-9\]\n  address" 2 "ivopts" } } */
++
++
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.dg/tree-ssa/scev-12.c
+@@ -0,0 +1,30 @@
++/* { dg-do compile } */
++/* { dg-options "-O2 -fdump-tree-ivopts-details" } */
++
++int a[128];
++extern int b[];
++
++int bar (int *);
++
++int
++foo (int x, int n)
++{
++  int i;
++
++  for (i = 0; i < n; i++)
++    {
++      unsigned char uc = (unsigned char)i;
++      if (x)
++	a[i] = i;
++      b[uc] = 0;
++    }
++
++  bar (a);
++  return 0;
++}
++
++/* Address of array reference to b is not scev.  */
++/* { dg-final { scan-tree-dump-times "use \[0-9\]\n  address" 1 "ivopts" } } */
++
++
++
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.dg/vect/pr57206.c
+@@ -0,0 +1,11 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target vect_float } */
++
++void bad0(float * d, unsigned int n)
++{
++  unsigned int i;
++  for (i=n; i>0; --i) 
++    d[n-i] = 0.0;
++}
++
++/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
+@@ -81,7 +81,7 @@ extern size_t strlen(const char *);
+ 	  abort();							\
+ 	}								\
+       }									\
+-    fprintf(stderr, "CHECKED %s\n", MSG);				\
++    fprintf(stderr, "CHECKED %s %s\n", STR(VECT_TYPE(T, W, N)), MSG);	\
+   }
+ 
+ /* Floating-point variant.  */
+@@ -110,7 +110,7 @@ extern size_t strlen(const char *);
+ 	  abort();							\
+ 	}								\
+       }									\
+-    fprintf(stderr, "CHECKED %s\n", MSG);				\
++    fprintf(stderr, "CHECKED %s %s\n", STR(VECT_TYPE(T, W, N)), MSG);	\
+   }
+ 
+ /* Clean buffer with a non-zero pattern to help diagnose buffer
+@@ -133,10 +133,16 @@ static ARRAY(result, uint, 32, 2);
+ static ARRAY(result, uint, 64, 1);
+ static ARRAY(result, poly, 8, 8);
+ static ARRAY(result, poly, 16, 4);
++#if defined (__ARM_FEATURE_CRYPTO)
++static ARRAY(result, poly, 64, 1);
++#endif
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ static ARRAY(result, float, 16, 4);
+ #endif
+ static ARRAY(result, float, 32, 2);
++#ifdef __aarch64__
++static ARRAY(result, float, 64, 1);
++#endif
+ static ARRAY(result, int, 8, 16);
+ static ARRAY(result, int, 16, 8);
+ static ARRAY(result, int, 32, 4);
+@@ -147,6 +153,9 @@ static ARRAY(result, uint, 32, 4);
+ static ARRAY(result, uint, 64, 2);
+ static ARRAY(result, poly, 8, 16);
+ static ARRAY(result, poly, 16, 8);
++#if defined (__ARM_FEATURE_CRYPTO)
++static ARRAY(result, poly, 64, 2);
++#endif
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ static ARRAY(result, float, 16, 8);
+ #endif
+@@ -169,6 +178,7 @@ extern ARRAY(expected, poly, 8, 8);
+ extern ARRAY(expected, poly, 16, 4);
+ extern ARRAY(expected, hfloat, 16, 4);
+ extern ARRAY(expected, hfloat, 32, 2);
++extern ARRAY(expected, hfloat, 64, 1);
+ extern ARRAY(expected, int, 8, 16);
+ extern ARRAY(expected, int, 16, 8);
+ extern ARRAY(expected, int, 32, 4);
+@@ -335,7 +345,8 @@ extern int VECT_VAR(expected_cumulative_sat, uint, 64, 2);
+ 	      strlen(COMMENT) > 0 ? " " COMMENT : "");			\
+       abort();								\
+     }									\
+-    fprintf(stderr, "CHECKED CUMULATIVE SAT %s\n", MSG);		\
++    fprintf(stderr, "CHECKED CUMULATIVE SAT %s %s\n",			\
++	    STR(VECT_TYPE(T, W, N)), MSG);				\
+   }
+ 
+ #define CHECK_CUMULATIVE_SAT_NAMED(test_name,EXPECTED,comment)		\
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/compute-ref-data.h
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/compute-ref-data.h
+@@ -118,6 +118,10 @@ VECT_VAR_DECL_INIT(buffer, uint, 32, 2);
+ PAD(buffer_pad, uint, 32, 2);
+ VECT_VAR_DECL_INIT(buffer, uint, 64, 1);
+ PAD(buffer_pad, uint, 64, 1);
++#if defined (__ARM_FEATURE_CRYPTO)
++VECT_VAR_DECL_INIT(buffer, poly, 64, 1);
++PAD(buffer_pad, poly, 64, 1);
++#endif
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ VECT_VAR_DECL_INIT(buffer, float, 16, 4);
+ PAD(buffer_pad, float, 16, 4);
+@@ -144,6 +148,10 @@ VECT_VAR_DECL_INIT(buffer, poly, 8, 16);
+ PAD(buffer_pad, poly, 8, 16);
+ VECT_VAR_DECL_INIT(buffer, poly, 16, 8);
+ PAD(buffer_pad, poly, 16, 8);
++#if defined (__ARM_FEATURE_CRYPTO)
++VECT_VAR_DECL_INIT(buffer, poly, 64, 2);
++PAD(buffer_pad, poly, 64, 2);
++#endif
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ VECT_VAR_DECL_INIT(buffer, float, 16, 8);
+ PAD(buffer_pad, float, 16, 8);
+@@ -178,6 +186,10 @@ VECT_VAR_DECL_INIT(buffer_dup, poly, 8, 8);
+ VECT_VAR_DECL(buffer_dup_pad, poly, 8, 8);
+ VECT_VAR_DECL_INIT(buffer_dup, poly, 16, 4);
+ VECT_VAR_DECL(buffer_dup_pad, poly, 16, 4);
++#if defined (__ARM_FEATURE_CRYPTO)
++VECT_VAR_DECL_INIT4(buffer_dup, poly, 64, 1);
++VECT_VAR_DECL(buffer_dup_pad, poly, 64, 1);
++#endif
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ VECT_VAR_DECL_INIT4(buffer_dup, float, 16, 4);
+ VECT_VAR_DECL(buffer_dup_pad, float, 16, 4);
+@@ -205,6 +217,10 @@ VECT_VAR_DECL_INIT(buffer_dup, poly, 8, 16);
+ VECT_VAR_DECL(buffer_dup_pad, poly, 8, 16);
+ VECT_VAR_DECL_INIT(buffer_dup, poly, 16, 8);
+ VECT_VAR_DECL(buffer_dup_pad, poly, 16, 8);
++#if defined (__ARM_FEATURE_CRYPTO)
++VECT_VAR_DECL_INIT4(buffer_dup, poly, 64, 2);
++VECT_VAR_DECL(buffer_dup_pad, poly, 64, 2);
++#endif
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ VECT_VAR_DECL_INIT(buffer_dup, float, 16, 8);
+ VECT_VAR_DECL(buffer_dup_pad, float, 16, 8);
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
+@@ -0,0 +1,663 @@
++/* This file contains tests for all the *p64 intrinsics, except for
++   vreinterpret which have their own testcase.  */
++
++/* { dg-require-effective-target arm_crypto_ok } */
++/* { dg-add-options arm_crypto } */
++
++#include <arm_neon.h>
++#include "arm-neon-ref.h"
++#include "compute-ref-data.h"
++
++/* Expected results: vbsl.  */
++VECT_VAR_DECL(vbsl_expected,poly,64,1) [] = { 0xfffffff1 };
++VECT_VAR_DECL(vbsl_expected,poly,64,2) [] = { 0xfffffff1,
++					      0xfffffff1 };
++
++/* Expected results: vceq.  */
++VECT_VAR_DECL(vceq_expected,uint,64,1) [] = { 0x0 };
++
++/* Expected results: vcombine.  */
++VECT_VAR_DECL(vcombine_expected,poly,64,2) [] = { 0xfffffffffffffff0, 0x88 };
++
++/* Expected results: vcreate.  */
++VECT_VAR_DECL(vcreate_expected,poly,64,1) [] = { 0x123456789abcdef0 };
++
++/* Expected results: vdup_lane.  */
++VECT_VAR_DECL(vdup_lane_expected,poly,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(vdup_lane_expected,poly,64,2) [] = { 0xfffffffffffffff0,
++						   0xfffffffffffffff0 };
++
++/* Expected results: vdup_n.  */
++VECT_VAR_DECL(vdup_n_expected0,poly,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(vdup_n_expected0,poly,64,2) [] = { 0xfffffffffffffff0,
++						 0xfffffffffffffff0 };
++VECT_VAR_DECL(vdup_n_expected1,poly,64,1) [] = { 0xfffffffffffffff1 };
++VECT_VAR_DECL(vdup_n_expected1,poly,64,2) [] = { 0xfffffffffffffff1,
++						 0xfffffffffffffff1 };
++VECT_VAR_DECL(vdup_n_expected2,poly,64,1) [] = { 0xfffffffffffffff2 };
++VECT_VAR_DECL(vdup_n_expected2,poly,64,2) [] = { 0xfffffffffffffff2,
++						 0xfffffffffffffff2 };
++
++/* Expected results: vext.  */
++VECT_VAR_DECL(vext_expected,poly,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(vext_expected,poly,64,2) [] = { 0xfffffffffffffff1, 0x88 };
++
++/* Expected results: vget_low.  */
++VECT_VAR_DECL(vget_low_expected,poly,64,1) [] = { 0xfffffffffffffff0 };
++
++/* Expected results: vld1.  */
++VECT_VAR_DECL(vld1_expected,poly,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(vld1_expected,poly,64,2) [] = { 0xfffffffffffffff0,
++					      0xfffffffffffffff1 };
++
++/* Expected results: vld1_dup.  */
++VECT_VAR_DECL(vld1_dup_expected0,poly,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(vld1_dup_expected0,poly,64,2) [] = { 0xfffffffffffffff0,
++						   0xfffffffffffffff0 };
++VECT_VAR_DECL(vld1_dup_expected1,poly,64,1) [] = { 0xfffffffffffffff1 };
++VECT_VAR_DECL(vld1_dup_expected1,poly,64,2) [] = { 0xfffffffffffffff1,
++						   0xfffffffffffffff1 };
++VECT_VAR_DECL(vld1_dup_expected2,poly,64,1) [] = { 0xfffffffffffffff2 };
++VECT_VAR_DECL(vld1_dup_expected2,poly,64,2) [] = { 0xfffffffffffffff2,
++						   0xfffffffffffffff2 };
++
++/* Expected results: vld1_lane.  */
++VECT_VAR_DECL(vld1_lane_expected,poly,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(vld1_lane_expected,poly,64,2) [] = { 0xfffffffffffffff0,
++						   0xaaaaaaaaaaaaaaaa };
++
++/* Expected results: vldX.  */
++VECT_VAR_DECL(vld2_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(vld2_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
++VECT_VAR_DECL(vld3_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(vld3_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
++VECT_VAR_DECL(vld3_expected_2,poly,64,1) [] = { 0xfffffffffffffff2 };
++VECT_VAR_DECL(vld4_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(vld4_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
++VECT_VAR_DECL(vld4_expected_2,poly,64,1) [] = { 0xfffffffffffffff2 };
++VECT_VAR_DECL(vld4_expected_3,poly,64,1) [] = { 0xfffffffffffffff3 };
++
++/* Expected results: vldX_dup.  */
++VECT_VAR_DECL(vld2_dup_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(vld2_dup_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
++VECT_VAR_DECL(vld3_dup_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(vld3_dup_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
++VECT_VAR_DECL(vld3_dup_expected_2,poly,64,1) [] = { 0xfffffffffffffff2 };
++VECT_VAR_DECL(vld4_dup_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(vld4_dup_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
++VECT_VAR_DECL(vld4_dup_expected_2,poly,64,1) [] = { 0xfffffffffffffff2 };
++VECT_VAR_DECL(vld4_dup_expected_3,poly,64,1) [] = { 0xfffffffffffffff3 };
++
++/* Expected results: vsli.  */
++VECT_VAR_DECL(vsli_expected,poly,64,1) [] = { 0x10 };
++VECT_VAR_DECL(vsli_expected,poly,64,2) [] = { 0x7ffffffffffff0,
++					      0x7ffffffffffff1 };
++VECT_VAR_DECL(vsli_expected_max_shift,poly,64,1) [] = { 0x7ffffffffffffff0 };
++VECT_VAR_DECL(vsli_expected_max_shift,poly,64,2) [] = { 0xfffffffffffffff0,
++							0xfffffffffffffff1 };
++
++/* Expected results: vsri.  */
++VECT_VAR_DECL(vsri_expected,poly,64,1) [] = { 0xe000000000000000 };
++VECT_VAR_DECL(vsri_expected,poly,64,2) [] = { 0xfffffffffffff800,
++					      0xfffffffffffff800 };
++VECT_VAR_DECL(vsri_expected_max_shift,poly,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(vsri_expected_max_shift,poly,64,2) [] = { 0xfffffffffffffff0,
++							0xfffffffffffffff1 };
++
++/* Expected results: vst1_lane.  */
++VECT_VAR_DECL(vst1_lane_expected,poly,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(vst1_lane_expected,poly,64,2) [] = { 0xfffffffffffffff0,
++						   0x3333333333333333 };
++
++int main (void)
++{
++  int i;
++
++  /* vbsl_p64 tests.  */
++#define TEST_MSG "VBSL/VBSLQ"
++
++#define TEST_VBSL(T3, Q, T1, T2, W, N)					\
++  VECT_VAR(vbsl_vector_res, T1, W, N) =					\
++    vbsl##Q##_##T2##W(VECT_VAR(vbsl_vector_first, T3, W, N),		\
++		      VECT_VAR(vbsl_vector, T1, W, N),			\
++		      VECT_VAR(vbsl_vector2, T1, W, N));		\
++  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vbsl_vector_res, T1, W, N))
++
++  DECL_VARIABLE(vbsl_vector, poly, 64, 1);
++  DECL_VARIABLE(vbsl_vector, poly, 64, 2);
++  DECL_VARIABLE(vbsl_vector2, poly, 64, 1);
++  DECL_VARIABLE(vbsl_vector2, poly, 64, 2);
++  DECL_VARIABLE(vbsl_vector_res, poly, 64, 1);
++  DECL_VARIABLE(vbsl_vector_res, poly, 64, 2);
++
++  DECL_VARIABLE(vbsl_vector_first, uint, 64, 1);
++  DECL_VARIABLE(vbsl_vector_first, uint, 64, 2);
++
++  CLEAN(result, poly, 64, 1);
++  CLEAN(result, poly, 64, 2);
++
++  VLOAD(vbsl_vector, buffer, , poly, p, 64, 1);
++  VLOAD(vbsl_vector, buffer, q, poly, p, 64, 2);
++
++  VDUP(vbsl_vector2, , poly, p, 64, 1, 0xFFFFFFF3);
++  VDUP(vbsl_vector2, q, poly, p, 64, 2, 0xFFFFFFF3);
++
++  VDUP(vbsl_vector_first, , uint, u, 64, 1, 0xFFFFFFF2);
++  VDUP(vbsl_vector_first, q, uint, u, 64, 2, 0xFFFFFFF2);
++
++  TEST_VBSL(uint, , poly, p, 64, 1);
++  TEST_VBSL(uint, q, poly, p, 64, 2);
++
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vbsl_expected, "");
++  CHECK(TEST_MSG, poly, 64, 2, PRIx64, vbsl_expected, "");
++
++  /* vceq_p64 tests. */
++#undef TEST_MSG
++#define TEST_MSG "VCEQ"
++
++#define TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N)				\
++  VECT_VAR(vceq_vector_res, T3, W, N) =					\
++    INSN##Q##_##T2##W(VECT_VAR(vceq_vector, T1, W, N),			\
++		      VECT_VAR(vceq_vector2, T1, W, N));		\
++  vst1##Q##_u##W(VECT_VAR(result, T3, W, N), VECT_VAR(vceq_vector_res, T3, W, N))
++
++#define TEST_VCOMP(INSN, Q, T1, T2, T3, W, N)				\
++  TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N)
++
++  DECL_VARIABLE(vceq_vector, poly, 64, 1);
++  DECL_VARIABLE(vceq_vector2, poly, 64, 1);
++  DECL_VARIABLE(vceq_vector_res, uint, 64, 1);
++
++  CLEAN(result, uint, 64, 1);
++
++  VLOAD(vceq_vector, buffer, , poly, p, 64, 1);
++
++  VDUP(vceq_vector2, , poly, p, 64, 1, 0x88);
++
++  TEST_VCOMP(vceq, , poly, p, uint, 64, 1);
++
++  CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceq_expected, "");
++
++  /* vcombine_p64 tests.  */
++#undef TEST_MSG
++#define TEST_MSG "VCOMBINE"
++
++#define TEST_VCOMBINE(T1, T2, W, N, N2)					\
++  VECT_VAR(vcombine_vector128, T1, W, N2) =				\
++    vcombine_##T2##W(VECT_VAR(vcombine_vector64_a, T1, W, N),		\
++		     VECT_VAR(vcombine_vector64_b, T1, W, N));		\
++  vst1q_##T2##W(VECT_VAR(result, T1, W, N2), VECT_VAR(vcombine_vector128, T1, W, N2))
++
++  DECL_VARIABLE(vcombine_vector64_a, poly, 64, 1);
++  DECL_VARIABLE(vcombine_vector64_b, poly, 64, 1);
++  DECL_VARIABLE(vcombine_vector128, poly, 64, 2);
++
++  CLEAN(result, poly, 64, 2);
++
++  VLOAD(vcombine_vector64_a, buffer, , poly, p, 64, 1);
++
++  VDUP(vcombine_vector64_b, , poly, p, 64, 1, 0x88);
++
++  TEST_VCOMBINE(poly, p, 64, 1, 2);
++
++  CHECK(TEST_MSG, poly, 64, 2, PRIx16, vcombine_expected, "");
++
++  /* vcreate_p64 tests.  */
++#undef TEST_MSG
++#define TEST_MSG "VCREATE"
++
++#define TEST_VCREATE(T1, T2, W, N)					\
++  VECT_VAR(vcreate_vector_res, T1, W, N) =				\
++    vcreate_##T2##W(VECT_VAR(vcreate_val, T1, W, N));			\
++  vst1_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vcreate_vector_res, T1, W, N))
++
++#define DECL_VAL(VAR, T1, W, N)			\
++  uint64_t VECT_VAR(VAR, T1, W, N)
++
++  DECL_VAL(vcreate_val, poly, 64, 1);
++  DECL_VARIABLE(vcreate_vector_res, poly, 64, 1);
++
++  CLEAN(result, poly, 64, 2);
++
++  VECT_VAR(vcreate_val, poly, 64, 1) = 0x123456789abcdef0ULL;
++
++  TEST_VCREATE(poly, p, 64, 1);
++
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vcreate_expected, "");
++
++  /* vdup_lane_p64 tests.  */
++#undef TEST_MSG
++#define TEST_MSG "VDUP_LANE/VDUP_LANEQ"
++
++#define TEST_VDUP_LANE(Q, T1, T2, W, N, N2, L)				\
++  VECT_VAR(vdup_lane_vector_res, T1, W, N) =				\
++    vdup##Q##_lane_##T2##W(VECT_VAR(vdup_lane_vector, T1, W, N2), L);	\
++  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vdup_lane_vector_res, T1, W, N))
++
++  DECL_VARIABLE(vdup_lane_vector, poly, 64, 1);
++  DECL_VARIABLE(vdup_lane_vector, poly, 64, 2);
++  DECL_VARIABLE(vdup_lane_vector_res, poly, 64, 1);
++  DECL_VARIABLE(vdup_lane_vector_res, poly, 64, 2);
++
++  CLEAN(result, poly, 64, 1);
++  CLEAN(result, poly, 64, 2);
++
++  VLOAD(vdup_lane_vector, buffer, , poly, p, 64, 1);
++
++  TEST_VDUP_LANE(, poly, p, 64, 1, 1, 0);
++  TEST_VDUP_LANE(q, poly, p, 64, 2, 1, 0);
++
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vdup_lane_expected, "");
++  CHECK(TEST_MSG, poly, 64, 2, PRIx64, vdup_lane_expected, "");
++
++  /* vdup_n_p64 tests.  */
++#undef TEST_MSG
++#define TEST_MSG "VDUP/VDUPQ"
++
++#define TEST_VDUP(Q, T1, T2, W, N)					\
++  VECT_VAR(vdup_n_vector, T1, W, N) =					\
++    vdup##Q##_n_##T2##W(VECT_VAR(buffer_dup, T1, W, N)[i]);		\
++  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vdup_n_vector, T1, W, N))
++
++  DECL_VARIABLE(vdup_n_vector, poly, 64, 1);
++  DECL_VARIABLE(vdup_n_vector, poly, 64, 2);
++
++  /* Try to read different places from the input buffer.  */
++  for (i=0; i< 3; i++) {
++    CLEAN(result, poly, 64, 1);
++    CLEAN(result, poly, 64, 2);
++
++    TEST_VDUP(, poly, p, 64, 1);
++    TEST_VDUP(q, poly, p, 64, 2);
++
++    switch (i) {
++    case 0:
++      CHECK(TEST_MSG, poly, 64, 1, PRIx64, vdup_n_expected0, "");
++      CHECK(TEST_MSG, poly, 64, 2, PRIx64, vdup_n_expected0, "");
++      break;
++    case 1:
++      CHECK(TEST_MSG, poly, 64, 1, PRIx64, vdup_n_expected1, "");
++      CHECK(TEST_MSG, poly, 64, 2, PRIx64, vdup_n_expected1, "");
++      break;
++    case 2:
++      CHECK(TEST_MSG, poly, 64, 1, PRIx64, vdup_n_expected2, "");
++      CHECK(TEST_MSG, poly, 64, 2, PRIx64, vdup_n_expected2, "");
++      break;
++    default:
++      abort();
++    }
++  }
++
++  /* vexit_p64 tests.  */
++#undef TEST_MSG
++#define TEST_MSG "VEXT/VEXTQ"
++
++#define TEST_VEXT(Q, T1, T2, W, N, V)					\
++  VECT_VAR(vext_vector_res, T1, W, N) =					\
++    vext##Q##_##T2##W(VECT_VAR(vext_vector1, T1, W, N),			\
++		      VECT_VAR(vext_vector2, T1, W, N),			\
++		      V);						\
++  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vext_vector_res, T1, W, N))
++
++  DECL_VARIABLE(vext_vector1, poly, 64, 1);
++  DECL_VARIABLE(vext_vector1, poly, 64, 2);
++  DECL_VARIABLE(vext_vector2, poly, 64, 1);
++  DECL_VARIABLE(vext_vector2, poly, 64, 2);
++  DECL_VARIABLE(vext_vector_res, poly, 64, 1);
++  DECL_VARIABLE(vext_vector_res, poly, 64, 2);
++
++  CLEAN(result, poly, 64, 1);
++  CLEAN(result, poly, 64, 2);
++
++  VLOAD(vext_vector1, buffer, , poly, p, 64, 1);
++  VLOAD(vext_vector1, buffer, q, poly, p, 64, 2);
++
++  VDUP(vext_vector2, , poly, p, 64, 1, 0x88);
++  VDUP(vext_vector2, q, poly, p, 64, 2, 0x88);
++
++  TEST_VEXT(, poly, p, 64, 1, 0);
++  TEST_VEXT(q, poly, p, 64, 2, 1);
++
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vext_expected, "");
++  CHECK(TEST_MSG, poly, 64, 2, PRIx64, vext_expected, "");
++
++  /* vget_low_p64 tests.  */
++#undef TEST_MSG
++#define TEST_MSG "VGET_LOW"
++
++#define TEST_VGET_LOW(T1, T2, W, N, N2)					\
++  VECT_VAR(vget_low_vector64, T1, W, N) =				\
++    vget_low_##T2##W(VECT_VAR(vget_low_vector128, T1, W, N2));		\
++  vst1_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vget_low_vector64, T1, W, N))
++
++  DECL_VARIABLE(vget_low_vector64, poly, 64, 1);
++  DECL_VARIABLE(vget_low_vector128, poly, 64, 2);
++
++  CLEAN(result, poly, 64, 1);
++
++  VLOAD(vget_low_vector128, buffer, q, poly, p, 64, 2);
++
++  TEST_VGET_LOW(poly, p, 64, 1, 2);
++
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vget_low_expected, "");
++
++  /* vld1_p64 tests.  */
++#undef TEST_MSG
++#define TEST_MSG "VLD1/VLD1Q"
++
++#define TEST_VLD1(VAR, BUF, Q, T1, T2, W, N)				\
++  VECT_VAR(VAR, T1, W, N) = vld1##Q##_##T2##W(VECT_VAR(BUF, T1, W, N)); \
++  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(VAR, T1, W, N))
++
++  DECL_VARIABLE(vld1_vector, poly, 64, 1);
++  DECL_VARIABLE(vld1_vector, poly, 64, 2);
++
++  CLEAN(result, poly, 64, 1);
++  CLEAN(result, poly, 64, 2);
++
++  VLOAD(vld1_vector, buffer, , poly, p, 64, 1);
++  VLOAD(vld1_vector, buffer, q, poly, p, 64, 2);
++
++  TEST_VLD1(vld1_vector, buffer, , poly, p, 64, 1);
++  TEST_VLD1(vld1_vector, buffer, q, poly, p, 64, 2);
++
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld1_expected, "");
++  CHECK(TEST_MSG, poly, 64, 2, PRIx64, vld1_expected, "");
++
++  /* vld1_dup_p64 tests.  */
++#undef TEST_MSG
++#define TEST_MSG "VLD1_DUP/VLD1_DUPQ"
++
++#define TEST_VLD1_DUP(VAR, BUF, Q, T1, T2, W, N)			\
++  VECT_VAR(VAR, T1, W, N) =						\
++    vld1##Q##_dup_##T2##W(&VECT_VAR(BUF, T1, W, N)[i]);			\
++  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(VAR, T1, W, N))
++
++  DECL_VARIABLE(vld1_dup_vector, poly, 64, 1);
++  DECL_VARIABLE(vld1_dup_vector, poly, 64, 2);
++
++  /* Try to read different places from the input buffer.  */
++  for (i=0; i<3; i++) {
++    CLEAN(result, poly, 64, 1);
++    CLEAN(result, poly, 64, 2);
++
++    TEST_VLD1_DUP(vld1_dup_vector, buffer_dup, , poly, p, 64, 1);
++    TEST_VLD1_DUP(vld1_dup_vector, buffer_dup, q, poly, p, 64, 2);
++
++    switch (i) {
++    case 0:
++      CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld1_dup_expected0, "");
++      CHECK(TEST_MSG, poly, 64, 2, PRIx64, vld1_dup_expected0, "");
++      break;
++    case 1:
++      CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld1_dup_expected1, "");
++      CHECK(TEST_MSG, poly, 64, 2, PRIx64, vld1_dup_expected1, "");
++      break;
++    case 2:
++      CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld1_dup_expected2, "");
++      CHECK(TEST_MSG, poly, 64, 2, PRIx64, vld1_dup_expected2, "");
++      break;
++    default:
++      abort();
++    }
++  }
++
++  /* vld1_lane_p64 tests.  */
++#undef TEST_MSG
++#define TEST_MSG "VLD1_LANE/VLD1_LANEQ"
++
++#define TEST_VLD1_LANE(Q, T1, T2, W, N, L)				\
++  memset (VECT_VAR(vld1_lane_buffer_src, T1, W, N), 0xAA, W/8*N);	\
++  VECT_VAR(vld1_lane_vector_src, T1, W, N) =				\
++    vld1##Q##_##T2##W(VECT_VAR(vld1_lane_buffer_src, T1, W, N));	\
++  VECT_VAR(vld1_lane_vector, T1, W, N) =				\
++    vld1##Q##_lane_##T2##W(VECT_VAR(buffer, T1, W, N),			\
++			   VECT_VAR(vld1_lane_vector_src, T1, W, N), L); \
++  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vld1_lane_vector, T1, W, N))
++
++  DECL_VARIABLE(vld1_lane_vector, poly, 64, 1);
++  DECL_VARIABLE(vld1_lane_vector, poly, 64, 2);
++  DECL_VARIABLE(vld1_lane_vector_src, poly, 64, 1);
++  DECL_VARIABLE(vld1_lane_vector_src, poly, 64, 2);
++
++  ARRAY(vld1_lane_buffer_src, poly, 64, 1);
++  ARRAY(vld1_lane_buffer_src, poly, 64, 2);
++
++  CLEAN(result, poly, 64, 1);
++  CLEAN(result, poly, 64, 2);
++
++  TEST_VLD1_LANE(, poly, p, 64, 1, 0);
++  TEST_VLD1_LANE(q, poly, p, 64, 2, 0);
++
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld1_lane_expected, "");
++  CHECK(TEST_MSG, poly, 64, 2, PRIx64, vld1_lane_expected, "");
++
++  /* vldX_p64 tests.  */
++#define DECL_VLDX(T1, W, N, X)						\
++  VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vldX_vector, T1, W, N, X); \
++  VECT_VAR_DECL(vldX_result_bis_##X, T1, W, N)[X * N]
++
++#define TEST_VLDX(Q, T1, T2, W, N, X)					\
++  VECT_ARRAY_VAR(vldX_vector, T1, W, N, X) =				\
++    /* Use dedicated init buffer, of size X */				\
++    vld##X##Q##_##T2##W(VECT_ARRAY_VAR(buffer_vld##X, T1, W, N, X));	\
++  vst##X##Q##_##T2##W(VECT_VAR(vldX_result_bis_##X, T1, W, N),		\
++		      VECT_ARRAY_VAR(vldX_vector, T1, W, N, X));	\
++  memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(vldX_result_bis_##X, T1, W, N), \
++	 sizeof(VECT_VAR(result, T1, W, N)));
++
++  /* Overwrite "result" with the contents of "result_bis"[Y].  */
++#define TEST_EXTRA_CHUNK(T1, W, N, X,Y)				\
++  memcpy(VECT_VAR(result, T1, W, N),				\
++	 &(VECT_VAR(vldX_result_bis_##X, T1, W, N)[Y*N]),	\
++	 sizeof(VECT_VAR(result, T1, W, N)));
++
++  DECL_VLDX(poly, 64, 1, 2);
++  DECL_VLDX(poly, 64, 1, 3);
++  DECL_VLDX(poly, 64, 1, 4);
++
++  VECT_ARRAY_INIT2(buffer_vld2, poly, 64, 1);
++  PAD(buffer_vld2_pad, poly, 64, 1);
++  VECT_ARRAY_INIT3(buffer_vld3, poly, 64, 1);
++  PAD(buffer_vld3_pad, poly, 64, 1);
++  VECT_ARRAY_INIT4(buffer_vld4, poly, 64, 1);
++  PAD(buffer_vld4_pad, poly, 64, 1);
++
++#undef TEST_MSG
++#define TEST_MSG "VLD2/VLD2Q"
++  CLEAN(result, poly, 64, 1);
++  TEST_VLDX(, poly, p, 64, 1, 2);
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld2_expected_0, "chunk 0");
++  CLEAN(result, poly, 64, 1);
++  TEST_EXTRA_CHUNK(poly, 64, 1, 2, 1);
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld2_expected_1, "chunk 1");
++
++#undef TEST_MSG
++#define TEST_MSG "VLD3/VLD3Q"
++  CLEAN(result, poly, 64, 1);
++  TEST_VLDX(, poly, p, 64, 1, 3);
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld3_expected_0, "chunk 0");
++  CLEAN(result, poly, 64, 1);
++  TEST_EXTRA_CHUNK(poly, 64, 1, 3, 1);
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld3_expected_1, "chunk 1");
++  CLEAN(result, poly, 64, 1);
++  TEST_EXTRA_CHUNK(poly, 64, 1, 3, 2);
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld3_expected_2, "chunk 2");
++
++#undef TEST_MSG
++#define TEST_MSG "VLD4/VLD4Q"
++  CLEAN(result, poly, 64, 1);
++  TEST_VLDX(, poly, p, 64, 1, 4);
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_0, "chunk 0");
++  CLEAN(result, poly, 64, 1);
++  TEST_EXTRA_CHUNK(poly, 64, 1, 4, 1);
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_1, "chunk 1");
++  CLEAN(result, poly, 64, 1);
++  TEST_EXTRA_CHUNK(poly, 64, 1, 4, 2);
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_2, "chunk 2");
++  CLEAN(result, poly, 64, 1);
++  TEST_EXTRA_CHUNK(poly, 64, 1, 4, 3);
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_3, "chunk 3");
++
++  /* vldX_dup_p64 tests.  */
++#define DECL_VLDX_DUP(T1, W, N, X)					\
++  VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vldX_dup_vector, T1, W, N, X); \
++  VECT_VAR_DECL(vldX_dup_result_bis_##X, T1, W, N)[X * N]
++
++#define TEST_VLDX_DUP(Q, T1, T2, W, N, X)				\
++  VECT_ARRAY_VAR(vldX_dup_vector, T1, W, N, X) =			\
++    vld##X##Q##_dup_##T2##W(&VECT_VAR(buffer_dup, T1, W, N)[0]);	\
++    									\
++  vst##X##Q##_##T2##W(VECT_VAR(vldX_dup_result_bis_##X, T1, W, N),	\
++		      VECT_ARRAY_VAR(vldX_dup_vector, T1, W, N, X));	\
++  memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(vldX_dup_result_bis_##X, T1, W, N), \
++	 sizeof(VECT_VAR(result, T1, W, N)));
++
++  /* Overwrite "result" with the contents of "result_bis"[Y].  */
++#define TEST_VLDX_DUP_EXTRA_CHUNK(T1, W, N, X,Y)		\
++  memcpy(VECT_VAR(result, T1, W, N),				\
++	 &(VECT_VAR(vldX_dup_result_bis_##X, T1, W, N)[Y*N]),	\
++	 sizeof(VECT_VAR(result, T1, W, N)));
++
++  DECL_VLDX_DUP(poly, 64, 1, 2);
++  DECL_VLDX_DUP(poly, 64, 1, 3);
++  DECL_VLDX_DUP(poly, 64, 1, 4);
++
++
++#undef TEST_MSG
++#define TEST_MSG "VLD2_DUP/VLD2Q_DUP"
++  CLEAN(result, poly, 64, 1);
++  TEST_VLDX_DUP(, poly, p, 64, 1, 2);
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld2_dup_expected_0, "chunk 0");
++  CLEAN(result, poly, 64, 1);
++  TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 2, 1);
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld2_dup_expected_1, "chunk 1");
++
++#undef TEST_MSG
++#define TEST_MSG "VLD3_DUP/VLD3Q_DUP"
++  CLEAN(result, poly, 64, 1);
++  TEST_VLDX_DUP(, poly, p, 64, 1, 3);
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld3_dup_expected_0, "chunk 0");
++  CLEAN(result, poly, 64, 1);
++  TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 3, 1);
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld3_dup_expected_1, "chunk 1");
++  CLEAN(result, poly, 64, 1);
++  TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 3, 2);
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld3_dup_expected_2, "chunk 2");
++
++#undef TEST_MSG
++#define TEST_MSG "VLD4_DUP/VLD4Q_DUP"
++  CLEAN(result, poly, 64, 1);
++  TEST_VLDX_DUP(, poly, p, 64, 1, 4);
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_0, "chunk 0");
++  CLEAN(result, poly, 64, 1);
++  TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 4, 1);
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_1, "chunk 1");
++  CLEAN(result, poly, 64, 1);
++  TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 4, 2);
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_2, "chunk 2");
++  CLEAN(result, poly, 64, 1);
++  TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 4, 3);
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_3, "chunk 3");
++
++  /* vsli_p64 tests.  */
++#undef TEST_MSG
++#define TEST_MSG "VSLI"
++
++#define TEST_VSXI1(INSN, Q, T1, T2, W, N, V)				\
++  VECT_VAR(vsXi_vector_res, T1, W, N) =					\
++    INSN##Q##_n_##T2##W(VECT_VAR(vsXi_vector, T1, W, N),		\
++		      VECT_VAR(vsXi_vector2, T1, W, N),			\
++		      V);						\
++  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vsXi_vector_res, T1, W, N))
++
++#define TEST_VSXI(INSN, Q, T1, T2, W, N, V)	\
++  TEST_VSXI1(INSN, Q, T1, T2, W, N, V)
++
++  DECL_VARIABLE(vsXi_vector, poly, 64, 1);
++  DECL_VARIABLE(vsXi_vector, poly, 64, 2);
++  DECL_VARIABLE(vsXi_vector2, poly, 64, 1);
++  DECL_VARIABLE(vsXi_vector2, poly, 64, 2);
++  DECL_VARIABLE(vsXi_vector_res, poly, 64, 1);
++  DECL_VARIABLE(vsXi_vector_res, poly, 64, 2);
++
++  CLEAN(result, poly, 64, 1);
++  CLEAN(result, poly, 64, 2);
++
++  VLOAD(vsXi_vector, buffer, , poly, p, 64, 1);
++  VLOAD(vsXi_vector, buffer, q, poly, p, 64, 2);
++
++  VDUP(vsXi_vector2, , poly, p, 64, 1, 2);
++  VDUP(vsXi_vector2, q, poly, p, 64, 2, 3);
++
++  TEST_VSXI(vsli, , poly, p, 64, 1, 3);
++  TEST_VSXI(vsli, q, poly, p, 64, 2, 53);
++
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vsli_expected, "");
++  CHECK(TEST_MSG, poly, 64, 2, PRIx64, vsli_expected, "");
++
++  /* Test cases with maximum shift amount.  */
++  CLEAN(result, poly, 64, 1);
++  CLEAN(result, poly, 64, 2);
++
++  TEST_VSXI(vsli, , poly, p, 64, 1, 63);
++  TEST_VSXI(vsli, q, poly, p, 64, 2, 63);
++
++#define COMMENT "(max shift amount)"
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vsli_expected_max_shift, COMMENT);
++  CHECK(TEST_MSG, poly, 64, 2, PRIx64, vsli_expected_max_shift, COMMENT);
++
++  /* vsri_p64 tests.  */
++#undef TEST_MSG
++#define TEST_MSG "VSRI"
++
++  CLEAN(result, poly, 64, 1);
++  CLEAN(result, poly, 64, 2);
++
++  VLOAD(vsXi_vector, buffer, , poly, p, 64, 1);
++  VLOAD(vsXi_vector, buffer, q, poly, p, 64, 2);
++
++  VDUP(vsXi_vector2, , poly, p, 64, 1, 2);
++  VDUP(vsXi_vector2, q, poly, p, 64, 2, 3);
++
++  TEST_VSXI(vsri, , poly, p, 64, 1, 3);
++  TEST_VSXI(vsri, q, poly, p, 64, 2, 53);
++
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vsri_expected, "");
++  CHECK(TEST_MSG, poly, 64, 2, PRIx64, vsri_expected, "");
++
++  /* Test cases with maximum shift amount.  */
++  CLEAN(result, poly, 64, 1);
++  CLEAN(result, poly, 64, 2);
++
++  TEST_VSXI(vsri, , poly, p, 64, 1, 64);
++  TEST_VSXI(vsri, q, poly, p, 64, 2, 64);
++
++#define COMMENT "(max shift amount)"
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vsri_expected_max_shift, COMMENT);
++  CHECK(TEST_MSG, poly, 64, 2, PRIx64, vsri_expected_max_shift, COMMENT);
++
++  /* vst1_lane_p64 tests.  */
++#undef TEST_MSG
++#define TEST_MSG "VST1_LANE/VST1_LANEQ"
++
++#define TEST_VST1_LANE(Q, T1, T2, W, N, L)				\
++  VECT_VAR(vst1_lane_vector, T1, W, N) =				\
++    vld1##Q##_##T2##W(VECT_VAR(buffer, T1, W, N));			\
++  vst1##Q##_lane_##T2##W(VECT_VAR(result, T1, W, N),			\
++			 VECT_VAR(vst1_lane_vector, T1, W, N), L)
++
++  DECL_VARIABLE(vst1_lane_vector, poly, 64, 1);
++  DECL_VARIABLE(vst1_lane_vector, poly, 64, 2);
++
++  CLEAN(result, poly, 64, 1);
++  CLEAN(result, poly, 64, 2);
++
++  TEST_VST1_LANE(, poly, p, 64, 1, 0);
++  TEST_VST1_LANE(q, poly, p, 64, 2, 0);
++
++  CHECK(TEST_MSG, poly, 64, 1, PRIx64, vst1_lane_expected, "");
++  CHECK(TEST_MSG, poly, 64, 2, PRIx64, vst1_lane_expected, "");
++
++  return 0;
++}
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c
+@@ -0,0 +1,490 @@
++#include <arm_neon.h>
++#include "arm-neon-ref.h"
++#include "compute-ref-data.h"
++
++#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA)
++
++#define A0 123.4f
++#define A1 -3.8f
++#define A2 -29.4f
++#define A3 (__builtin_inff ())
++#define A4 0.0f
++#define A5 24.0f
++#define A6 124.0f
++#define A7 1024.0f
++
++#define B0 -5.8f
++#define B1 -0.0f
++#define B2 -10.8f
++#define B3 10.0f
++#define B4 23.4f
++#define B5 -1234.8f
++#define B6 8.9f
++#define B7 4.0f
++
++#define E0 9.8f
++#define E1 -1024.0f
++#define E2 (-__builtin_inff ())
++#define E3 479.0f
++float32_t elem0 = E0;
++float32_t elem1 = E1;
++float32_t elem2 = E2;
++float32_t elem3 = E3;
++
++#define DA0 1231234.4
++#define DA1 -3.8
++#define DA2 -2980.4
++#define DA3 -5.8
++#define DA4 0.01123
++#define DA5 24.0
++#define DA6 124.12345
++#define DA7 1024.0
++
++#define DB0 -5.8
++#define DB1 (__builtin_inf ())
++#define DB2 -105.8
++#define DB3 10.0
++#define DB4 (-__builtin_inf ())
++#define DB5 -1234.8
++#define DB6 848.9
++#define DB7 44444.0
++
++#define DE0 9.8
++#define DE1 -1024.0
++#define DE2 105.8
++#define DE3 479.0
++float64_t delem0 = DE0;
++float64_t delem1 = DE1;
++float64_t delem2 = DE2;
++float64_t delem3 = DE3;
++
++/* Expected results for vfms_n.  */
++
++VECT_VAR_DECL(expectedfms0, float, 32, 2) [] = {A0 + -B0 * E0, A1 + -B1 * E0};
++VECT_VAR_DECL(expectedfms1, float, 32, 2) [] = {A2 + -B2 * E1, A3 + -B3 * E1};
++VECT_VAR_DECL(expectedfms2, float, 32, 2) [] = {A4 + -B4 * E2, A5 + -B5 * E2};
++VECT_VAR_DECL(expectedfms3, float, 32, 2) [] = {A6 + -B6 * E3, A7 + -B7 * E3};
++VECT_VAR_DECL(expectedfma0, float, 32, 2) [] = {A0 + B0 * E0, A1 + B1 * E0};
++VECT_VAR_DECL(expectedfma1, float, 32, 2) [] = {A2 + B2 * E1, A3 + B3 * E1};
++VECT_VAR_DECL(expectedfma2, float, 32, 2) [] = {A4 + B4 * E2, A5 + B5 * E2};
++VECT_VAR_DECL(expectedfma3, float, 32, 2) [] = {A6 + B6 * E3, A7 + B7 * E3};
++
++hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 2) =
++  (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 2);
++hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 2) =
++  (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 2);
++hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 2) =
++  (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 2);
++hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 2) =
++  (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 2);
++hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 2) =
++  (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 2);
++hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 2) =
++  (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 2);
++hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 2) =
++  (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 2);
++hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 2) =
++  (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 2);
++
++
++VECT_VAR_DECL(expectedfms0, float, 32, 4) [] = {A0 + -B0 * E0, A1 + -B1 * E0,
++						A2 + -B2 * E0, A3 + -B3 * E0};
++VECT_VAR_DECL(expectedfms1, float, 32, 4) [] = {A4 + -B4 * E1, A5 + -B5 * E1,
++						A6 + -B6 * E1, A7 + -B7 * E1};
++VECT_VAR_DECL(expectedfms2, float, 32, 4) [] = {A0 + -B0 * E2, A2 + -B2 * E2,
++						A4 + -B4 * E2, A6 + -B6 * E2};
++VECT_VAR_DECL(expectedfms3, float, 32, 4) [] = {A1 + -B1 * E3, A3 + -B3 * E3,
++						A5 + -B5 * E3, A7 + -B7 * E3};
++VECT_VAR_DECL(expectedfma0, float, 32, 4) [] = {A0 + B0 * E0, A1 + B1 * E0,
++						A2 + B2 * E0, A3 + B3 * E0};
++VECT_VAR_DECL(expectedfma1, float, 32, 4) [] = {A4 + B4 * E1, A5 + B5 * E1,
++						A6 + B6 * E1, A7 + B7 * E1};
++VECT_VAR_DECL(expectedfma2, float, 32, 4) [] = {A0 + B0 * E2, A2 + B2 * E2,
++						A4 + B4 * E2, A6 + B6 * E2};
++VECT_VAR_DECL(expectedfma3, float, 32, 4) [] = {A1 + B1 * E3, A3 + B3 * E3,
++						A5 + B5 * E3, A7 + B7 * E3};
++
++hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 4) =
++  (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 4);
++hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 4) =
++  (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 4);
++hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 4) =
++  (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 4);
++hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 4) =
++  (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 4);
++hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 4) =
++  (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 4);
++hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 4) =
++  (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 4);
++hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 4) =
++  (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 4);
++hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 4) =
++  (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 4);
++
++VECT_VAR_DECL(expectedfms0, float, 64, 2) [] = {DA0 + -DB0 * DE0,
++						DA1 + -DB1 * DE0};
++VECT_VAR_DECL(expectedfms1, float, 64, 2) [] = {DA2 + -DB2 * DE1,
++						DA3 + -DB3 * DE1};
++VECT_VAR_DECL(expectedfms2, float, 64, 2) [] = {DA4 + -DB4 * DE2,
++						DA5 + -DB5 * DE2};
++VECT_VAR_DECL(expectedfms3, float, 64, 2) [] = {DA6 + -DB6 * DE3,
++						DA7 + -DB7 * DE3};
++VECT_VAR_DECL(expectedfma0, float, 64, 2) [] = {DA0 + DB0 * DE0,
++						DA1 + DB1 * DE0};
++VECT_VAR_DECL(expectedfma1, float, 64, 2) [] = {DA2 + DB2 * DE1,
++						DA3 + DB3 * DE1};
++VECT_VAR_DECL(expectedfma2, float, 64, 2) [] = {DA4 + DB4 * DE2,
++						DA5 + DB5 * DE2};
++VECT_VAR_DECL(expectedfma3, float, 64, 2) [] = {DA6 + DB6 * DE3,
++						DA7 + DB7 * DE3};
++hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 2) =
++  (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 2);
++hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 2) =
++  (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 2);
++hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 2) =
++  (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 2);
++hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 2) =
++  (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 2);
++hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 2) =
++  (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 2);
++hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 2) =
++  (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 2);
++hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 2) =
++  (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 2);
++hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 2) =
++  (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 2);
++
++VECT_VAR_DECL(expectedfms0, float, 64, 1) [] = {DA0 + -DB0 * DE0};
++VECT_VAR_DECL(expectedfms1, float, 64, 1) [] = {DA2 + -DB2 * DE1};
++VECT_VAR_DECL(expectedfms2, float, 64, 1) [] = {DA4 + -DB4 * DE2};
++VECT_VAR_DECL(expectedfms3, float, 64, 1) [] = {DA6 + -DB6 * DE3};
++VECT_VAR_DECL(expectedfma0, float, 64, 1) [] = {DA0 + DB0 * DE0};
++VECT_VAR_DECL(expectedfma1, float, 64, 1) [] = {DA2 + DB2 * DE1};
++VECT_VAR_DECL(expectedfma2, float, 64, 1) [] = {DA4 + DB4 * DE2};
++VECT_VAR_DECL(expectedfma3, float, 64, 1) [] = {DA6 + DB6 * DE3};
++
++hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 1) =
++  (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 1);
++hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 1) =
++  (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 1);
++hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 1) =
++  (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 1);
++hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 1) =
++  (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 1);
++hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 1) =
++  (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 1);
++hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 1) =
++  (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 1);
++hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 1) =
++  (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 1);
++hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 1) =
++  (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 1);
++
++void exec_vfma_vfms_n (void)
++{
++#undef TEST_MSG
++#define TEST_MSG "VFMS_VFMA_N (FP32)"
++  clean_results ();
++
++  DECL_VARIABLE(vsrc_1, float, 32, 2);
++  DECL_VARIABLE(vsrc_2, float, 32, 2);
++  VECT_VAR_DECL (buf_src_1, float, 32, 2) [] = {A0, A1};
++  VECT_VAR_DECL (buf_src_2, float, 32, 2) [] = {B0, B1};
++  VLOAD (vsrc_1, buf_src_1, , float, f, 32, 2);
++  VLOAD (vsrc_2, buf_src_2, , float, f, 32, 2);
++  DECL_VARIABLE (vector_res, float, 32, 2) =
++    vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
++		VECT_VAR (vsrc_2, float, 32, 2), elem0);
++  vst1_f32 (VECT_VAR (result, float, 32, 2),
++	    VECT_VAR (vector_res, float, 32, 2));
++  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms0_static, "");
++  VECT_VAR (vector_res, float, 32, 2) =
++    vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
++		VECT_VAR (vsrc_2, float, 32, 2), elem0);
++  vst1_f32 (VECT_VAR (result, float, 32, 2),
++	    VECT_VAR (vector_res, float, 32, 2));
++  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma0_static, "");
++
++  VECT_VAR_DECL (buf_src_3, float, 32, 2) [] = {A2, A3};
++  VECT_VAR_DECL (buf_src_4, float, 32, 2) [] = {B2, B3};
++  VLOAD (vsrc_1, buf_src_3, , float, f, 32, 2);
++  VLOAD (vsrc_2, buf_src_4, , float, f, 32, 2);
++  VECT_VAR (vector_res, float, 32, 2) =
++    vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
++		VECT_VAR (vsrc_2, float, 32, 2), elem1);
++  vst1_f32 (VECT_VAR (result, float, 32, 2),
++	    VECT_VAR (vector_res, float, 32, 2));
++  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms1_static, "");
++  VECT_VAR (vector_res, float, 32, 2) =
++    vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
++		VECT_VAR (vsrc_2, float, 32, 2), elem1);
++  vst1_f32 (VECT_VAR (result, float, 32, 2),
++	    VECT_VAR (vector_res, float, 32, 2));
++  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma1_static, "");
++
++  VECT_VAR_DECL (buf_src_5, float, 32, 2) [] = {A4, A5};
++  VECT_VAR_DECL (buf_src_6, float, 32, 2) [] = {B4, B5};
++  VLOAD (vsrc_1, buf_src_5, , float, f, 32, 2);
++  VLOAD (vsrc_2, buf_src_6, , float, f, 32, 2);
++  VECT_VAR (vector_res, float, 32, 2) =
++    vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
++		VECT_VAR (vsrc_2, float, 32, 2), elem2);
++  vst1_f32 (VECT_VAR (result, float, 32, 2),
++	    VECT_VAR (vector_res, float, 32, 2));
++  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms2_static, "");
++  VECT_VAR (vector_res, float, 32, 2) =
++    vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
++		VECT_VAR (vsrc_2, float, 32, 2), elem2);
++  vst1_f32 (VECT_VAR (result, float, 32, 2),
++	    VECT_VAR (vector_res, float, 32, 2));
++  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma2_static, "");
++
++  VECT_VAR_DECL (buf_src_7, float, 32, 2) [] = {A6, A7};
++  VECT_VAR_DECL (buf_src_8, float, 32, 2) [] = {B6, B7};
++  VLOAD (vsrc_1, buf_src_7, , float, f, 32, 2);
++  VLOAD (vsrc_2, buf_src_8, , float, f, 32, 2);
++  VECT_VAR (vector_res, float, 32, 2) =
++    vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
++		VECT_VAR (vsrc_2, float, 32, 2), elem3);
++  vst1_f32 (VECT_VAR (result, float, 32, 2),
++	    VECT_VAR (vector_res, float, 32, 2));
++  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms3_static, "");
++  VECT_VAR (vector_res, float, 32, 2) =
++    vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
++		VECT_VAR (vsrc_2, float, 32, 2), elem3);
++  vst1_f32 (VECT_VAR (result, float, 32, 2),
++	    VECT_VAR (vector_res, float, 32, 2));
++  CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma3_static, "");
++
++#undef TEST_MSG
++#define TEST_MSG "VFMSQ_VFMAQ_N (FP32)"
++  clean_results ();
++
++  DECL_VARIABLE(vsrc_1, float, 32, 4);
++  DECL_VARIABLE(vsrc_2, float, 32, 4);
++  VECT_VAR_DECL (buf_src_1, float, 32, 4) [] = {A0, A1, A2, A3};
++  VECT_VAR_DECL (buf_src_2, float, 32, 4) [] = {B0, B1, B2, B3};
++  VLOAD (vsrc_1, buf_src_1, q, float, f, 32, 4);
++  VLOAD (vsrc_2, buf_src_2, q, float, f, 32, 4);
++  DECL_VARIABLE (vector_res, float, 32, 4) =
++    vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
++		 VECT_VAR (vsrc_2, float, 32, 4), elem0);
++  vst1q_f32 (VECT_VAR (result, float, 32, 4),
++	     VECT_VAR (vector_res, float, 32, 4));
++  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms0_static, "");
++  VECT_VAR (vector_res, float, 32, 4) =
++    vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
++		 VECT_VAR (vsrc_2, float, 32, 4), elem0);
++  vst1q_f32 (VECT_VAR (result, float, 32, 4),
++	     VECT_VAR (vector_res, float, 32, 4));
++  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma0_static, "");
++
++  VECT_VAR_DECL (buf_src_3, float, 32, 4) [] = {A4, A5, A6, A7};
++  VECT_VAR_DECL (buf_src_4, float, 32, 4) [] = {B4, B5, B6, B7};
++  VLOAD (vsrc_1, buf_src_3, q, float, f, 32, 4);
++  VLOAD (vsrc_2, buf_src_4, q, float, f, 32, 4);
++  VECT_VAR (vector_res, float, 32, 4) =
++    vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
++		 VECT_VAR (vsrc_2, float, 32, 4), elem1);
++  vst1q_f32 (VECT_VAR (result, float, 32, 4),
++	     VECT_VAR (vector_res, float, 32, 4));
++  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms1_static, "");
++  VECT_VAR (vector_res, float, 32, 4) =
++    vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
++		 VECT_VAR (vsrc_2, float, 32, 4), elem1);
++  vst1q_f32 (VECT_VAR (result, float, 32, 4),
++	     VECT_VAR (vector_res, float, 32, 4));
++  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma1_static, "");
++
++  VECT_VAR_DECL (buf_src_5, float, 32, 4) [] = {A0, A2, A4, A6};
++  VECT_VAR_DECL (buf_src_6, float, 32, 4) [] = {B0, B2, B4, B6};
++  VLOAD (vsrc_1, buf_src_5, q, float, f, 32, 4);
++  VLOAD (vsrc_2, buf_src_6, q, float, f, 32, 4);
++  VECT_VAR (vector_res, float, 32, 4) =
++    vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
++		 VECT_VAR (vsrc_2, float, 32, 4), elem2);
++  vst1q_f32 (VECT_VAR (result, float, 32, 4),
++	     VECT_VAR (vector_res, float, 32, 4));
++  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms2_static, "");
++  VECT_VAR (vector_res, float, 32, 4) =
++    vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
++		 VECT_VAR (vsrc_2, float, 32, 4), elem2);
++  vst1q_f32 (VECT_VAR (result, float, 32, 4),
++	     VECT_VAR (vector_res, float, 32, 4));
++  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma2_static, "");
++
++  VECT_VAR_DECL (buf_src_7, float, 32, 4) [] = {A1, A3, A5, A7};
++  VECT_VAR_DECL (buf_src_8, float, 32, 4) [] = {B1, B3, B5, B7};
++  VLOAD (vsrc_1, buf_src_7, q, float, f, 32, 4);
++  VLOAD (vsrc_2, buf_src_8, q, float, f, 32, 4);
++  VECT_VAR (vector_res, float, 32, 4) =
++    vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
++		 VECT_VAR (vsrc_2, float, 32, 4), elem3);
++  vst1q_f32 (VECT_VAR (result, float, 32, 4),
++	     VECT_VAR (vector_res, float, 32, 4));
++  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms3_static, "");
++  VECT_VAR (vector_res, float, 32, 4) =
++    vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
++		 VECT_VAR (vsrc_2, float, 32, 4), elem3);
++  vst1q_f32 (VECT_VAR (result, float, 32, 4),
++	     VECT_VAR (vector_res, float, 32, 4));
++  CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma3_static, "");
++
++#undef TEST_MSG
++#define TEST_MSG "VFMSQ_VFMAQ_N (FP64)"
++  clean_results ();
++
++  DECL_VARIABLE(vsrc_1, float, 64, 2);
++  DECL_VARIABLE(vsrc_2, float, 64, 2);
++  VECT_VAR_DECL (buf_src_1, float, 64, 2) [] = {DA0, DA1};
++  VECT_VAR_DECL (buf_src_2, float, 64, 2) [] = {DB0, DB1};
++  VLOAD (vsrc_1, buf_src_1, q, float, f, 64, 2);
++  VLOAD (vsrc_2, buf_src_2, q, float, f, 64, 2);
++  DECL_VARIABLE (vector_res, float, 64, 2) =
++    vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
++		 VECT_VAR (vsrc_2, float, 64, 2), delem0);
++  vst1q_f64 (VECT_VAR (result, float, 64, 2),
++	     VECT_VAR (vector_res, float, 64, 2));
++  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms0_static, "");
++  VECT_VAR (vector_res, float, 64, 2) =
++    vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
++		 VECT_VAR (vsrc_2, float, 64, 2), delem0);
++  vst1q_f64 (VECT_VAR (result, float, 64, 2),
++	     VECT_VAR (vector_res, float, 64, 2));
++  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma0_static, "");
++
++  VECT_VAR_DECL (buf_src_3, float, 64, 2) [] = {DA2, DA3};
++  VECT_VAR_DECL (buf_src_4, float, 64, 2) [] = {DB2, DB3};
++  VLOAD (vsrc_1, buf_src_3, q, float, f, 64, 2);
++  VLOAD (vsrc_2, buf_src_4, q, float, f, 64, 2);
++  VECT_VAR (vector_res, float, 64, 2) =
++    vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
++		 VECT_VAR (vsrc_2, float, 64, 2), delem1);
++  vst1q_f64 (VECT_VAR (result, float, 64, 2),
++	     VECT_VAR (vector_res, float, 64, 2));
++  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms1_static, "");
++  VECT_VAR (vector_res, float, 64, 2) =
++    vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
++		 VECT_VAR (vsrc_2, float, 64, 2), delem1);
++  vst1q_f64 (VECT_VAR (result, float, 64, 2),
++	     VECT_VAR (vector_res, float, 64, 2));
++  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma1_static, "");
++
++  VECT_VAR_DECL (buf_src_5, float, 64, 2) [] = {DA4, DA5};
++  VECT_VAR_DECL (buf_src_6, float, 64, 2) [] = {DB4, DB5};
++  VLOAD (vsrc_1, buf_src_5, q, float, f, 64, 2);
++  VLOAD (vsrc_2, buf_src_6, q, float, f, 64, 2);
++  VECT_VAR (vector_res, float, 64, 2) =
++    vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
++		 VECT_VAR (vsrc_2, float, 64, 2), delem2);
++  vst1q_f64 (VECT_VAR (result, float, 64, 2),
++	     VECT_VAR (vector_res, float, 64, 2));
++  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms2_static, "");
++  VECT_VAR (vector_res, float, 64, 2) =
++    vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
++		 VECT_VAR (vsrc_2, float, 64, 2), delem2);
++  vst1q_f64 (VECT_VAR (result, float, 64, 2),
++	     VECT_VAR (vector_res, float, 64, 2));
++  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma2_static, "");
++
++  VECT_VAR_DECL (buf_src_7, float, 64, 2) [] = {DA6, DA7};
++  VECT_VAR_DECL (buf_src_8, float, 64, 2) [] = {DB6, DB7};
++  VLOAD (vsrc_1, buf_src_7, q, float, f, 64, 2);
++  VLOAD (vsrc_2, buf_src_8, q, float, f, 64, 2);
++  VECT_VAR (vector_res, float, 64, 2) =
++    vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
++		 VECT_VAR (vsrc_2, float, 64, 2), delem3);
++  vst1q_f64 (VECT_VAR (result, float, 64, 2),
++	     VECT_VAR (vector_res, float, 64, 2));
++  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms3_static, "");
++  VECT_VAR (vector_res, float, 64, 2) =
++    vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
++		 VECT_VAR (vsrc_2, float, 64, 2), delem3);
++  vst1q_f64 (VECT_VAR (result, float, 64, 2),
++	     VECT_VAR (vector_res, float, 64, 2));
++  CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma3_static, "");
++
++#undef TEST_MSG
++#define TEST_MSG "VFMS_VFMA_N (FP64)"
++  clean_results ();
++
++  DECL_VARIABLE(vsrc_1, float, 64, 1);
++  DECL_VARIABLE(vsrc_2, float, 64, 1);
++  VECT_VAR_DECL (buf_src_1, float, 64, 1) [] = {DA0};
++  VECT_VAR_DECL (buf_src_2, float, 64, 1) [] = {DB0};
++  VLOAD (vsrc_1, buf_src_1, , float, f, 64, 1);
++  VLOAD (vsrc_2, buf_src_2, , float, f, 64, 1);
++  DECL_VARIABLE (vector_res, float, 64, 1) =
++    vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
++		VECT_VAR (vsrc_2, float, 64, 1), delem0);
++  vst1_f64 (VECT_VAR (result, float, 64, 1),
++	     VECT_VAR (vector_res, float, 64, 1));
++  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms0_static, "");
++  VECT_VAR (vector_res, float, 64, 1) =
++    vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
++		VECT_VAR (vsrc_2, float, 64, 1), delem0);
++  vst1_f64 (VECT_VAR (result, float, 64, 1),
++	     VECT_VAR (vector_res, float, 64, 1));
++  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma0_static, "");
++
++  VECT_VAR_DECL (buf_src_3, float, 64, 1) [] = {DA2};
++  VECT_VAR_DECL (buf_src_4, float, 64, 1) [] = {DB2};
++  VLOAD (vsrc_1, buf_src_3, , float, f, 64, 1);
++  VLOAD (vsrc_2, buf_src_4, , float, f, 64, 1);
++  VECT_VAR (vector_res, float, 64, 1) =
++    vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
++		VECT_VAR (vsrc_2, float, 64, 1), delem1);
++  vst1_f64 (VECT_VAR (result, float, 64, 1),
++	     VECT_VAR (vector_res, float, 64, 1));
++  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms1_static, "");
++  VECT_VAR (vector_res, float, 64, 1) =
++    vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
++		VECT_VAR (vsrc_2, float, 64, 1), delem1);
++  vst1_f64 (VECT_VAR (result, float, 64, 1),
++	     VECT_VAR (vector_res, float, 64, 1));
++  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma1_static, "");
++
++  VECT_VAR_DECL (buf_src_5, float, 64, 1) [] = {DA4};
++  VECT_VAR_DECL (buf_src_6, float, 64, 1) [] = {DB4};
++  VLOAD (vsrc_1, buf_src_5, , float, f, 64, 1);
++  VLOAD (vsrc_2, buf_src_6, , float, f, 64, 1);
++  VECT_VAR (vector_res, float, 64, 1) =
++    vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
++		VECT_VAR (vsrc_2, float, 64, 1), delem2);
++  vst1_f64 (VECT_VAR (result, float, 64, 1),
++	     VECT_VAR (vector_res, float, 64, 1));
++  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms2_static, "");
++  VECT_VAR (vector_res, float, 64, 1) =
++    vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
++		VECT_VAR (vsrc_2, float, 64, 1), delem2);
++  vst1_f64 (VECT_VAR (result, float, 64, 1),
++	     VECT_VAR (vector_res, float, 64, 1));
++  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma2_static, "");
++
++  VECT_VAR_DECL (buf_src_7, float, 64, 1) [] = {DA6};
++  VECT_VAR_DECL (buf_src_8, float, 64, 1) [] = {DB6};
++  VLOAD (vsrc_1, buf_src_7, , float, f, 64, 1);
++  VLOAD (vsrc_2, buf_src_8, , float, f, 64, 1);
++  VECT_VAR (vector_res, float, 64, 1) =
++    vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
++		VECT_VAR (vsrc_2, float, 64, 1), delem3);
++  vst1_f64 (VECT_VAR (result, float, 64, 1),
++	     VECT_VAR (vector_res, float, 64, 1));
++  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms3_static, "");
++  VECT_VAR (vector_res, float, 64, 1) =
++    vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
++		VECT_VAR (vsrc_2, float, 64, 1), delem3);
++  vst1_f64 (VECT_VAR (result, float, 64, 1),
++	     VECT_VAR (vector_res, float, 64, 1));
++  CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma3_static, "");
++}
++#endif
++
++int
++main (void)
++{
++#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA)
++  exec_vfma_vfms_n ();
++#endif
++  return 0;
++}
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_lane.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_lane.c
+@@ -13,6 +13,7 @@ uint32_t   expected_u32  = 0xfffffff1;
+ uint64_t   expected_u64  = 0xfffffffffffffff0;
+ poly8_t    expected_p8   = 0xf6;
+ poly16_t   expected_p16  = 0xfff2;
++hfloat16_t expected_f16  = 0xcb80;
+ hfloat32_t expected_f32  = 0xc1700000;
+ 
+ int8_t     expectedq_s8  = 0xff;
+@@ -25,6 +26,7 @@ uint32_t   expectedq_u32 = 0xfffffff2;
+ uint64_t   expectedq_u64 = 0xfffffffffffffff1;
+ poly8_t    expectedq_p8  = 0xfe;
+ poly16_t   expectedq_p16 = 0xfff6;
++hfloat16_t expectedq_f16 = 0xca80;
+ hfloat32_t expectedq_f32 = 0xc1500000;
+ 
+ int error_found = 0;
+@@ -52,6 +54,10 @@ void exec_vget_lane (void)
+     uint32_t var_int32;
+     float32_t var_float32;
+   } var_int32_float32;
++  union {
++    uint16_t var_int16;
++    float16_t var_float16;
++  } var_int16_float16;
+ 
+ #define TEST_VGET_LANE_FP(Q, T1, T2, W, N, L)				   \
+   VAR(var, T1, W) = vget##Q##_lane_##T2##W(VECT_VAR(vector, T1, W, N), L); \
+@@ -81,10 +87,17 @@ void exec_vget_lane (void)
+   VAR_DECL(var, uint, 64);
+   VAR_DECL(var, poly, 8);
+   VAR_DECL(var, poly, 16);
++#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
++  VAR_DECL(var, float, 16);
++#endif
+   VAR_DECL(var, float, 32);
+ 
+   /* Initialize input values.  */
+   TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
++#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
++  VLOAD(vector, buffer, , float, f, 16, 4);
++  VLOAD(vector, buffer, q, float, f, 16, 8);
++#endif
+   VLOAD(vector, buffer, , float, f, 32, 2);
+   VLOAD(vector, buffer, q, float, f, 32, 4);
+ 
+@@ -99,6 +112,9 @@ void exec_vget_lane (void)
+   TEST_VGET_LANE(, uint, u, 64, 1, 0);
+   TEST_VGET_LANE(, poly, p, 8, 8, 6);
+   TEST_VGET_LANE(, poly, p, 16, 4, 2);
++#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
++  TEST_VGET_LANE_FP(, float, f, 16, 4, 1);
++#endif
+   TEST_VGET_LANE_FP(, float, f, 32, 2, 1);
+ 
+   TEST_VGET_LANE(q, int, s, 8, 16, 15);
+@@ -111,6 +127,9 @@ void exec_vget_lane (void)
+   TEST_VGET_LANE(q, uint, u, 64, 2, 1);
+   TEST_VGET_LANE(q, poly, p, 8, 16, 14);
+   TEST_VGET_LANE(q, poly, p, 16, 8, 6);
++#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
++  TEST_VGET_LANE_FP(q, float, f, 16, 8, 3);
++#endif
+   TEST_VGET_LANE_FP(q, float, f, 32, 4, 3);
+ }
+ 
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul.c
+@@ -37,10 +37,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x60, 0xca, 0x34, 0x9e,
+ VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc4c73333, 0xc4bac000,
+ 					   0xc4ae4ccd, 0xc4a1d999 };
+ 
+-#ifndef INSN_NAME
+ #define INSN_NAME vmul
+ #define TEST_MSG "VMUL"
+-#endif
+ 
+ #define FNNAME1(NAME) exec_ ## NAME
+ #define FNNAME(NAME) FNNAME1(NAME)
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret.c
+@@ -21,6 +21,8 @@ VECT_VAR_DECL(expected_s8_8,int,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
+ 					    0xf4, 0xf5, 0xf6, 0xf7 };
+ VECT_VAR_DECL(expected_s8_9,int,8,8) [] = { 0xf0, 0xff, 0xf1, 0xff,
+ 					    0xf2, 0xff, 0xf3, 0xff };
++VECT_VAR_DECL(expected_s8_10,int,8,8) [] = { 0x00, 0xcc, 0x80, 0xcb,
++					     0x00, 0xcb, 0x80, 0xca };
+ 
+ /* Expected results for vreinterpret_s16_xx.  */
+ VECT_VAR_DECL(expected_s16_1,int,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
+@@ -32,6 +34,7 @@ VECT_VAR_DECL(expected_s16_6,int,16,4) [] = { 0xfff0, 0xffff, 0xfff1, 0xffff };
+ VECT_VAR_DECL(expected_s16_7,int,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
+ VECT_VAR_DECL(expected_s16_8,int,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
+ VECT_VAR_DECL(expected_s16_9,int,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
++VECT_VAR_DECL(expected_s16_10,int,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
+ 
+ /* Expected results for vreinterpret_s32_xx.  */
+ VECT_VAR_DECL(expected_s32_1,int,32,2) [] = { 0xf3f2f1f0, 0xf7f6f5f4 };
+@@ -43,6 +46,7 @@ VECT_VAR_DECL(expected_s32_6,int,32,2) [] = { 0xfffffff0, 0xfffffff1 };
+ VECT_VAR_DECL(expected_s32_7,int,32,2) [] = { 0xfffffff0, 0xffffffff };
+ VECT_VAR_DECL(expected_s32_8,int,32,2) [] = { 0xf3f2f1f0, 0xf7f6f5f4 };
+ VECT_VAR_DECL(expected_s32_9,int,32,2) [] = { 0xfff1fff0, 0xfff3fff2 };
++VECT_VAR_DECL(expected_s32_10,int,32,2) [] = { 0xcb80cc00, 0xca80cb00 };
+ 
+ /* Expected results for vreinterpret_s64_xx.  */
+ VECT_VAR_DECL(expected_s64_1,int,64,1) [] = { 0xf7f6f5f4f3f2f1f0 };
+@@ -54,6 +58,7 @@ VECT_VAR_DECL(expected_s64_6,int,64,1) [] = { 0xfffffff1fffffff0 };
+ VECT_VAR_DECL(expected_s64_7,int,64,1) [] = { 0xfffffffffffffff0 };
+ VECT_VAR_DECL(expected_s64_8,int,64,1) [] = { 0xf7f6f5f4f3f2f1f0 };
+ VECT_VAR_DECL(expected_s64_9,int,64,1) [] = { 0xfff3fff2fff1fff0 };
++VECT_VAR_DECL(expected_s64_10,int,64,1) [] = { 0xca80cb00cb80cc00 };
+ 
+ /* Expected results for vreinterpret_u8_xx.  */
+ VECT_VAR_DECL(expected_u8_1,uint,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
+@@ -74,6 +79,8 @@ VECT_VAR_DECL(expected_u8_8,uint,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
+ 					     0xf4, 0xf5, 0xf6, 0xf7 };
+ VECT_VAR_DECL(expected_u8_9,uint,8,8) [] = { 0xf0, 0xff, 0xf1, 0xff,
+ 					     0xf2, 0xff, 0xf3, 0xff };
++VECT_VAR_DECL(expected_u8_10,uint,8,8) [] = { 0x00, 0xcc, 0x80, 0xcb,
++					      0x00, 0xcb, 0x80, 0xca };
+ 
+ /* Expected results for vreinterpret_u16_xx.  */
+ VECT_VAR_DECL(expected_u16_1,uint,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
+@@ -85,6 +92,7 @@ VECT_VAR_DECL(expected_u16_6,uint,16,4) [] = { 0xfff0, 0xffff, 0xfff1, 0xffff };
+ VECT_VAR_DECL(expected_u16_7,uint,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
+ VECT_VAR_DECL(expected_u16_8,uint,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
+ VECT_VAR_DECL(expected_u16_9,uint,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
++VECT_VAR_DECL(expected_u16_10,uint,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
+ 
+ /* Expected results for vreinterpret_u32_xx.  */
+ VECT_VAR_DECL(expected_u32_1,uint,32,2) [] = { 0xf3f2f1f0, 0xf7f6f5f4 };
+@@ -96,6 +104,7 @@ VECT_VAR_DECL(expected_u32_6,uint,32,2) [] = { 0xfff1fff0, 0xfff3fff2 };
+ VECT_VAR_DECL(expected_u32_7,uint,32,2) [] = { 0xfffffff0, 0xffffffff };
+ VECT_VAR_DECL(expected_u32_8,uint,32,2) [] = { 0xf3f2f1f0, 0xf7f6f5f4 };
+ VECT_VAR_DECL(expected_u32_9,uint,32,2) [] = { 0xfff1fff0, 0xfff3fff2 };
++VECT_VAR_DECL(expected_u32_10,uint,32,2) [] = { 0xcb80cc00, 0xca80cb00 };
+ 
+ /* Expected results for vreinterpret_u64_xx.  */
+ VECT_VAR_DECL(expected_u64_1,uint,64,1) [] = { 0xf7f6f5f4f3f2f1f0 };
+@@ -107,6 +116,7 @@ VECT_VAR_DECL(expected_u64_6,uint,64,1) [] = { 0xfff3fff2fff1fff0 };
+ VECT_VAR_DECL(expected_u64_7,uint,64,1) [] = { 0xfffffff1fffffff0 };
+ VECT_VAR_DECL(expected_u64_8,uint,64,1) [] = { 0xf7f6f5f4f3f2f1f0 };
+ VECT_VAR_DECL(expected_u64_9,uint,64,1) [] = { 0xfff3fff2fff1fff0 };
++VECT_VAR_DECL(expected_u64_10,uint,64,1) [] = { 0xca80cb00cb80cc00 };
+ 
+ /* Expected results for vreinterpret_p8_xx.  */
+ VECT_VAR_DECL(expected_p8_1,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
+@@ -127,6 +137,8 @@ VECT_VAR_DECL(expected_p8_8,poly,8,8) [] = { 0xf0, 0xff, 0xff, 0xff,
+ 					     0xff, 0xff, 0xff, 0xff };
+ VECT_VAR_DECL(expected_p8_9,poly,8,8) [] = { 0xf0, 0xff, 0xf1, 0xff,
+ 					     0xf2, 0xff, 0xf3, 0xff };
++VECT_VAR_DECL(expected_p8_10,poly,8,8) [] = { 0x00, 0xcc, 0x80, 0xcb,
++					      0x00, 0xcb, 0x80, 0xca };
+ 
+ /* Expected results for vreinterpret_p16_xx.  */
+ VECT_VAR_DECL(expected_p16_1,poly,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
+@@ -138,6 +150,7 @@ VECT_VAR_DECL(expected_p16_6,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+ VECT_VAR_DECL(expected_p16_7,poly,16,4) [] = { 0xfff0, 0xffff, 0xfff1, 0xffff };
+ VECT_VAR_DECL(expected_p16_8,poly,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
+ VECT_VAR_DECL(expected_p16_9,poly,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
++VECT_VAR_DECL(expected_p16_10,poly,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
+ 
+ /* Expected results for vreinterpretq_s8_xx.  */
+ VECT_VAR_DECL(expected_q_s8_1,int,8,16) [] = { 0xf0, 0xff, 0xf1, 0xff,
+@@ -176,6 +189,10 @@ VECT_VAR_DECL(expected_q_s8_9,int,8,16) [] = { 0xf0, 0xff, 0xf1, 0xff,
+ 					       0xf2, 0xff, 0xf3, 0xff,
+ 					       0xf4, 0xff, 0xf5, 0xff,
+ 					       0xf6, 0xff, 0xf7, 0xff };
++VECT_VAR_DECL(expected_q_s8_10,int,8,16) [] = { 0x00, 0xcc, 0x80, 0xcb,
++						0x00, 0xcb, 0x80, 0xca,
++						0x00, 0xca, 0x80, 0xc9,
++						0x00, 0xc9, 0x80, 0xc8 };
+ 
+ /* Expected results for vreinterpretq_s16_xx.  */
+ VECT_VAR_DECL(expected_q_s16_1,int,16,8) [] = { 0xf1f0, 0xf3f2,
+@@ -214,6 +231,10 @@ VECT_VAR_DECL(expected_q_s16_9,int,16,8) [] = { 0xfff0, 0xfff1,
+ 						0xfff2, 0xfff3,
+ 						0xfff4, 0xfff5,
+ 						0xfff6, 0xfff7 };
++VECT_VAR_DECL(expected_q_s16_10,int,16,8) [] = { 0xcc00, 0xcb80,
++						 0xcb00, 0xca80,
++						 0xca00, 0xc980,
++						 0xc900, 0xc880 };
+ 
+ /* Expected results for vreinterpretq_s32_xx.  */
+ VECT_VAR_DECL(expected_q_s32_1,int,32,4) [] = { 0xf3f2f1f0, 0xf7f6f5f4,
+@@ -234,6 +255,8 @@ VECT_VAR_DECL(expected_q_s32_8,int,32,4) [] = { 0xf3f2f1f0, 0xf7f6f5f4,
+ 						0xfbfaf9f8, 0xfffefdfc };
+ VECT_VAR_DECL(expected_q_s32_9,int,32,4) [] = { 0xfff1fff0, 0xfff3fff2,
+ 						0xfff5fff4, 0xfff7fff6 };
++VECT_VAR_DECL(expected_q_s32_10,int,32,4) [] = { 0xcb80cc00, 0xca80cb00,
++						 0xc980ca00, 0xc880c900 };
+ 
+ /* Expected results for vreinterpretq_s64_xx.  */
+ VECT_VAR_DECL(expected_q_s64_1,int,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
+@@ -254,6 +277,8 @@ VECT_VAR_DECL(expected_q_s64_8,int,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
+ 						0xfffefdfcfbfaf9f8 };
+ VECT_VAR_DECL(expected_q_s64_9,int,64,2) [] = { 0xfff3fff2fff1fff0,
+ 						0xfff7fff6fff5fff4 };
++VECT_VAR_DECL(expected_q_s64_10,int,64,2) [] = { 0xca80cb00cb80cc00,
++						 0xc880c900c980ca00 };
+ 
+ /* Expected results for vreinterpretq_u8_xx.  */
+ VECT_VAR_DECL(expected_q_u8_1,uint,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
+@@ -292,6 +317,10 @@ VECT_VAR_DECL(expected_q_u8_9,uint,8,16) [] = { 0xf0, 0xff, 0xf1, 0xff,
+ 						0xf2, 0xff, 0xf3, 0xff,
+ 						0xf4, 0xff, 0xf5, 0xff,
+ 						0xf6, 0xff, 0xf7, 0xff };
++VECT_VAR_DECL(expected_q_u8_10,uint,8,16) [] = { 0x00, 0xcc, 0x80, 0xcb,
++						 0x00, 0xcb, 0x80, 0xca,
++						 0x00, 0xca, 0x80, 0xc9,
++						 0x00, 0xc9, 0x80, 0xc8 };
+ 
+ /* Expected results for vreinterpretq_u16_xx.  */
+ VECT_VAR_DECL(expected_q_u16_1,uint,16,8) [] = { 0xf1f0, 0xf3f2,
+@@ -330,6 +359,10 @@ VECT_VAR_DECL(expected_q_u16_9,uint,16,8) [] = { 0xfff0, 0xfff1,
+ 						 0xfff2, 0xfff3,
+ 						 0xfff4, 0xfff5,
+ 						 0xfff6, 0xfff7 };
++VECT_VAR_DECL(expected_q_u16_10,uint,16,8) [] = { 0xcc00, 0xcb80,
++						  0xcb00, 0xca80,
++						  0xca00, 0xc980,
++						  0xc900, 0xc880 };
+ 
+ /* Expected results for vreinterpretq_u32_xx.  */
+ VECT_VAR_DECL(expected_q_u32_1,uint,32,4) [] = { 0xf3f2f1f0, 0xf7f6f5f4,
+@@ -350,6 +383,8 @@ VECT_VAR_DECL(expected_q_u32_8,uint,32,4) [] = { 0xf3f2f1f0, 0xf7f6f5f4,
+ 						 0xfbfaf9f8, 0xfffefdfc };
+ VECT_VAR_DECL(expected_q_u32_9,uint,32,4) [] = { 0xfff1fff0, 0xfff3fff2,
+ 						 0xfff5fff4, 0xfff7fff6 };
++VECT_VAR_DECL(expected_q_u32_10,uint,32,4) [] = { 0xcb80cc00, 0xca80cb00,
++						  0xc980ca00, 0xc880c900 };
+ 
+ /* Expected results for vreinterpretq_u64_xx.  */
+ VECT_VAR_DECL(expected_q_u64_1,uint,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
+@@ -370,6 +405,92 @@ VECT_VAR_DECL(expected_q_u64_8,uint,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
+ 						0xfffefdfcfbfaf9f8 };
+ VECT_VAR_DECL(expected_q_u64_9,uint,64,2) [] = { 0xfff3fff2fff1fff0,
+ 						 0xfff7fff6fff5fff4 };
++VECT_VAR_DECL(expected_q_u64_10,uint,64,2) [] = { 0xca80cb00cb80cc00,
++						  0xc880c900c980ca00 };
++
++/* Expected results for vreinterpretq_p8_xx.  */
++VECT_VAR_DECL(expected_q_p8_1,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
++						0xf4, 0xf5, 0xf6, 0xf7,
++						0xf8, 0xf9, 0xfa, 0xfb,
++						0xfc, 0xfd, 0xfe, 0xff };
++VECT_VAR_DECL(expected_q_p8_2,poly,8,16) [] = { 0xf0, 0xff, 0xf1, 0xff,
++						0xf2, 0xff, 0xf3, 0xff,
++						0xf4, 0xff, 0xf5, 0xff,
++						0xf6, 0xff, 0xf7, 0xff };
++VECT_VAR_DECL(expected_q_p8_3,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
++						0xf1, 0xff, 0xff, 0xff,
++						0xf2, 0xff, 0xff, 0xff,
++						0xf3, 0xff, 0xff, 0xff };
++VECT_VAR_DECL(expected_q_p8_4,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
++						0xff, 0xff, 0xff, 0xff,
++						0xf1, 0xff, 0xff, 0xff,
++						0xff, 0xff, 0xff, 0xff };
++VECT_VAR_DECL(expected_q_p8_5,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
++						0xf4, 0xf5, 0xf6, 0xf7,
++						0xf8, 0xf9, 0xfa, 0xfb,
++						0xfc, 0xfd, 0xfe, 0xff };
++VECT_VAR_DECL(expected_q_p8_6,poly,8,16) [] = { 0xf0, 0xff, 0xf1, 0xff,
++						0xf2, 0xff, 0xf3, 0xff,
++						0xf4, 0xff, 0xf5, 0xff,
++						0xf6, 0xff, 0xf7, 0xff };
++VECT_VAR_DECL(expected_q_p8_7,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
++						0xf1, 0xff, 0xff, 0xff,
++						0xf2, 0xff, 0xff, 0xff,
++						0xf3, 0xff, 0xff, 0xff };
++VECT_VAR_DECL(expected_q_p8_8,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
++						0xff, 0xff, 0xff, 0xff,
++						0xf1, 0xff, 0xff, 0xff,
++						0xff, 0xff, 0xff, 0xff };
++VECT_VAR_DECL(expected_q_p8_9,poly,8,16) [] = { 0xf0, 0xff, 0xf1, 0xff,
++						0xf2, 0xff, 0xf3, 0xff,
++						0xf4, 0xff, 0xf5, 0xff,
++						0xf6, 0xff, 0xf7, 0xff };
++VECT_VAR_DECL(expected_q_p8_10,poly,8,16) [] = { 0x00, 0xcc, 0x80, 0xcb,
++						 0x00, 0xcb, 0x80, 0xca,
++						 0x00, 0xca, 0x80, 0xc9,
++						 0x00, 0xc9, 0x80, 0xc8 };
++
++/* Expected results for vreinterpretq_p16_xx.  */
++VECT_VAR_DECL(expected_q_p16_1,poly,16,8) [] = { 0xf1f0, 0xf3f2,
++						 0xf5f4, 0xf7f6,
++						 0xf9f8, 0xfbfa,
++						 0xfdfc, 0xfffe };
++VECT_VAR_DECL(expected_q_p16_2,poly,16,8) [] = { 0xfff0, 0xfff1,
++						 0xfff2, 0xfff3,
++						 0xfff4, 0xfff5,
++						 0xfff6, 0xfff7 };
++VECT_VAR_DECL(expected_q_p16_3,poly,16,8) [] = { 0xfff0, 0xffff,
++						 0xfff1, 0xffff,
++						 0xfff2, 0xffff,
++						 0xfff3, 0xffff };
++VECT_VAR_DECL(expected_q_p16_4,poly,16,8) [] = { 0xfff0, 0xffff,
++						 0xffff, 0xffff,
++						 0xfff1, 0xffff,
++						 0xffff, 0xffff };
++VECT_VAR_DECL(expected_q_p16_5,poly,16,8) [] = { 0xf1f0, 0xf3f2,
++						 0xf5f4, 0xf7f6,
++						 0xf9f8, 0xfbfa,
++						 0xfdfc, 0xfffe };
++VECT_VAR_DECL(expected_q_p16_6,poly,16,8) [] = { 0xfff0, 0xfff1,
++						 0xfff2, 0xfff3,
++						 0xfff4, 0xfff5,
++						 0xfff6, 0xfff7 };
++VECT_VAR_DECL(expected_q_p16_7,poly,16,8) [] = { 0xfff0, 0xffff,
++						 0xfff1, 0xffff,
++						 0xfff2, 0xffff,
++						 0xfff3, 0xffff };
++VECT_VAR_DECL(expected_q_p16_8,poly,16,8) [] = { 0xfff0, 0xffff,
++						 0xffff, 0xffff,
++						 0xfff1, 0xffff,
++						 0xffff, 0xffff };
++VECT_VAR_DECL(expected_q_p16_9,poly,16,8) [] = { 0xf1f0, 0xf3f2,
++						 0xf5f4, 0xf7f6,
++						 0xf9f8, 0xfbfa,
++						 0xfdfc, 0xfffe };
++VECT_VAR_DECL(expected_q_p16_10,poly,16,8) [] = { 0xcc00, 0xcb80,
++						  0xcb00, 0xca80,
++						  0xca00, 0xc980,
++						  0xc900, 0xc880 };
+ 
+ /* Expected results for vreinterpret_f32_xx.  */
+ VECT_VAR_DECL(expected_f32_1,hfloat,32,2) [] = { 0xf3f2f1f0, 0xf7f6f5f4 };
+@@ -382,6 +503,7 @@ VECT_VAR_DECL(expected_f32_7,hfloat,32,2) [] = { 0xfffffff0, 0xfffffff1 };
+ VECT_VAR_DECL(expected_f32_8,hfloat,32,2) [] = { 0xfffffff0, 0xffffffff };
+ VECT_VAR_DECL(expected_f32_9,hfloat,32,2) [] = { 0xf3f2f1f0, 0xf7f6f5f4 };
+ VECT_VAR_DECL(expected_f32_10,hfloat,32,2) [] = { 0xfff1fff0, 0xfff3fff2 };
++VECT_VAR_DECL(expected_f32_11,hfloat,32,2) [] = { 0xcb80cc00, 0xca80cb00 };
+ 
+ /* Expected results for vreinterpretq_f32_xx.  */
+ VECT_VAR_DECL(expected_q_f32_1,hfloat,32,4) [] = { 0xf3f2f1f0, 0xf7f6f5f4,
+@@ -404,8 +526,10 @@ VECT_VAR_DECL(expected_q_f32_9,hfloat,32,4) [] = { 0xf3f2f1f0, 0xf7f6f5f4,
+ 						   0xfbfaf9f8, 0xfffefdfc };
+ VECT_VAR_DECL(expected_q_f32_10,hfloat,32,4) [] = { 0xfff1fff0, 0xfff3fff2,
+ 						    0xfff5fff4, 0xfff7fff6 };
++VECT_VAR_DECL(expected_q_f32_11,hfloat,32,4) [] = { 0xcb80cc00, 0xca80cb00,
++						    0xc980ca00, 0xc880c900 };
+ 
+-/* Expected results for vreinterpretq_xx_f32.  */
++/* Expected results for vreinterpret_xx_f32.  */
+ VECT_VAR_DECL(expected_xx_f32_1,int,8,8) [] = { 0x0, 0x0, 0x80, 0xc1,
+ 						0x0, 0x0, 0x70, 0xc1 };
+ VECT_VAR_DECL(expected_xx_f32_2,int,16,4) [] = { 0x0, 0xc180, 0x0, 0xc170 };
+@@ -419,6 +543,7 @@ VECT_VAR_DECL(expected_xx_f32_8,uint,64,1) [] = { 0xc1700000c1800000 };
+ VECT_VAR_DECL(expected_xx_f32_9,poly,8,8) [] = { 0x0, 0x0, 0x80, 0xc1,
+ 						 0x0, 0x0, 0x70, 0xc1 };
+ VECT_VAR_DECL(expected_xx_f32_10,poly,16,4) [] = { 0x0, 0xc180, 0x0, 0xc170 };
++VECT_VAR_DECL(expected_xx_f32_11,hfloat,16,4) [] = { 0x0, 0xc180, 0x0, 0xc170 };
+ 
+ /* Expected results for vreinterpretq_xx_f32.  */
+ VECT_VAR_DECL(expected_q_xx_f32_1,int,8,16) [] = { 0x0, 0x0, 0x80, 0xc1,
+@@ -447,6 +572,62 @@ VECT_VAR_DECL(expected_q_xx_f32_9,poly,8,16) [] = { 0x0, 0x0, 0x80, 0xc1,
+ 						    0x0, 0x0, 0x50, 0xc1 };
+ VECT_VAR_DECL(expected_q_xx_f32_10,poly,16,8) [] = { 0x0, 0xc180, 0x0, 0xc170,
+ 						     0x0, 0xc160, 0x0, 0xc150 };
++VECT_VAR_DECL(expected_q_xx_f32_11,hfloat,16,8) [] = { 0x0, 0xc180, 0x0, 0xc170,
++						      0x0, 0xc160, 0x0, 0xc150 };
++
++/* Expected results for vreinterpret_f16_xx.  */
++VECT_VAR_DECL(expected_f16_1,hfloat,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
++VECT_VAR_DECL(expected_f16_2,hfloat,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
++VECT_VAR_DECL(expected_f16_3,hfloat,16,4) [] = { 0xfff0, 0xffff, 0xfff1, 0xffff };
++VECT_VAR_DECL(expected_f16_4,hfloat,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
++VECT_VAR_DECL(expected_f16_5,hfloat,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
++VECT_VAR_DECL(expected_f16_6,hfloat,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
++VECT_VAR_DECL(expected_f16_7,hfloat,16,4) [] = { 0xfff0, 0xffff, 0xfff1, 0xffff };
++VECT_VAR_DECL(expected_f16_8,hfloat,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
++VECT_VAR_DECL(expected_f16_9,hfloat,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
++VECT_VAR_DECL(expected_f16_10,hfloat,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
++
++/* Expected results for vreinterpretq_f16_xx.  */
++VECT_VAR_DECL(expected_q_f16_1,hfloat,16,8) [] = { 0xf1f0, 0xf3f2,
++						   0xf5f4, 0xf7f6,
++						   0xf9f8, 0xfbfa,
++						   0xfdfc, 0xfffe };
++VECT_VAR_DECL(expected_q_f16_2,hfloat,16,8) [] = { 0xfff0, 0xfff1,
++						   0xfff2, 0xfff3,
++						   0xfff4, 0xfff5,
++						   0xfff6, 0xfff7 };
++VECT_VAR_DECL(expected_q_f16_3,hfloat,16,8) [] = { 0xfff0, 0xffff,
++						   0xfff1, 0xffff,
++						   0xfff2, 0xffff,
++						   0xfff3, 0xffff };
++VECT_VAR_DECL(expected_q_f16_4,hfloat,16,8) [] = { 0xfff0, 0xffff,
++						   0xffff, 0xffff,
++						   0xfff1, 0xffff,
++						   0xffff, 0xffff };
++VECT_VAR_DECL(expected_q_f16_5,hfloat,16,8) [] = { 0xf1f0, 0xf3f2,
++						   0xf5f4, 0xf7f6,
++						   0xf9f8, 0xfbfa,
++						   0xfdfc, 0xfffe };
++VECT_VAR_DECL(expected_q_f16_6,hfloat,16,8) [] = { 0xfff0, 0xfff1,
++						   0xfff2, 0xfff3,
++						   0xfff4, 0xfff5,
++						   0xfff6, 0xfff7 };
++VECT_VAR_DECL(expected_q_f16_7,hfloat,16,8) [] = { 0xfff0, 0xffff,
++						   0xfff1, 0xffff,
++						   0xfff2, 0xffff,
++						   0xfff3, 0xffff };
++VECT_VAR_DECL(expected_q_f16_8,hfloat,16,8) [] = { 0xfff0, 0xffff,
++						   0xffff, 0xffff,
++						   0xfff1, 0xffff,
++						   0xffff, 0xffff };
++VECT_VAR_DECL(expected_q_f16_9,hfloat,16,8) [] = { 0xf1f0, 0xf3f2,
++						   0xf5f4, 0xf7f6,
++						   0xf9f8, 0xfbfa,
++						   0xfdfc, 0xfffe };
++VECT_VAR_DECL(expected_q_f16_10,hfloat,16,8) [] = { 0xfff0, 0xfff1,
++						    0xfff2, 0xfff3,
++						    0xfff4, 0xfff5,
++						    0xfff6, 0xfff7 };
+ 
+ #define TEST_MSG "VREINTERPRET/VREINTERPRETQ"
+ 
+@@ -484,7 +665,9 @@ void exec_vreinterpret (void)
+ 
+   /* Initialize input "vector" from "buffer".  */
+   TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
++  VLOAD(vector, buffer, , float, f, 16, 4);
+   VLOAD(vector, buffer, , float, f, 32, 2);
++  VLOAD(vector, buffer, q, float, f, 16, 8);
+   VLOAD(vector, buffer, q, float, f, 32, 4);
+ 
+   /* vreinterpret_s8_xx.  */
+@@ -497,6 +680,7 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET(, int, s, 8, 8, uint, u, 64, 1, expected_s8_7);
+   TEST_VREINTERPRET(, int, s, 8, 8, poly, p, 8, 8, expected_s8_8);
+   TEST_VREINTERPRET(, int, s, 8, 8, poly, p, 16, 4, expected_s8_9);
++  TEST_VREINTERPRET(, int, s, 8, 8, float, f, 16, 4, expected_s8_10);
+ 
+   /* vreinterpret_s16_xx.  */
+   TEST_VREINTERPRET(, int, s, 16, 4, int, s, 8, 8, expected_s16_1);
+@@ -508,6 +692,7 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET(, int, s, 16, 4, uint, u, 64, 1, expected_s16_7);
+   TEST_VREINTERPRET(, int, s, 16, 4, poly, p, 8, 8, expected_s16_8);
+   TEST_VREINTERPRET(, int, s, 16, 4, poly, p, 16, 4, expected_s16_9);
++  TEST_VREINTERPRET(, int, s, 16, 4, float, f, 16, 4, expected_s16_10);
+ 
+   /* vreinterpret_s32_xx.  */
+   TEST_VREINTERPRET(, int, s, 32, 2, int, s, 8, 8, expected_s32_1);
+@@ -519,6 +704,7 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET(, int, s, 32, 2, uint, u, 64, 1, expected_s32_7);
+   TEST_VREINTERPRET(, int, s, 32, 2, poly, p, 8, 8, expected_s32_8);
+   TEST_VREINTERPRET(, int, s, 32, 2, poly, p, 16, 4, expected_s32_9);
++  TEST_VREINTERPRET(, int, s, 32, 2, float, f, 16, 4, expected_s32_10);
+ 
+   /* vreinterpret_s64_xx.  */
+   TEST_VREINTERPRET(, int, s, 64, 1, int, s, 8, 8, expected_s64_1);
+@@ -530,6 +716,7 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET(, int, s, 64, 1, uint, u, 64, 1, expected_s64_7);
+   TEST_VREINTERPRET(, int, s, 64, 1, poly, p, 8, 8, expected_s64_8);
+   TEST_VREINTERPRET(, int, s, 64, 1, poly, p, 16, 4, expected_s64_9);
++  TEST_VREINTERPRET(, int, s, 64, 1, float, f, 16, 4, expected_s64_10);
+ 
+   /* vreinterpret_u8_xx.  */
+   TEST_VREINTERPRET(, uint, u, 8, 8, int, s, 8, 8, expected_u8_1);
+@@ -541,6 +728,7 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET(, uint, u, 8, 8, uint, u, 64, 1, expected_u8_7);
+   TEST_VREINTERPRET(, uint, u, 8, 8, poly, p, 8, 8, expected_u8_8);
+   TEST_VREINTERPRET(, uint, u, 8, 8, poly, p, 16, 4, expected_u8_9);
++  TEST_VREINTERPRET(, uint, u, 8, 8, float, f, 16, 4, expected_u8_10);
+ 
+   /* vreinterpret_u16_xx.  */
+   TEST_VREINTERPRET(, uint, u, 16, 4, int, s, 8, 8, expected_u16_1);
+@@ -552,6 +740,7 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET(, uint, u, 16, 4, uint, u, 64, 1, expected_u16_7);
+   TEST_VREINTERPRET(, uint, u, 16, 4, poly, p, 8, 8, expected_u16_8);
+   TEST_VREINTERPRET(, uint, u, 16, 4, poly, p, 16, 4, expected_u16_9);
++  TEST_VREINTERPRET(, uint, u, 16, 4, float, f, 16, 4, expected_u16_10);
+ 
+   /* vreinterpret_u32_xx.  */
+   TEST_VREINTERPRET(, uint, u, 32, 2, int, s, 8, 8, expected_u32_1);
+@@ -563,6 +752,7 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET(, uint, u, 32, 2, uint, u, 64, 1, expected_u32_7);
+   TEST_VREINTERPRET(, uint, u, 32, 2, poly, p, 8, 8, expected_u32_8);
+   TEST_VREINTERPRET(, uint, u, 32, 2, poly, p, 16, 4, expected_u32_9);
++  TEST_VREINTERPRET(, uint, u, 32, 2, float, f, 16, 4, expected_u32_10);
+ 
+   /* vreinterpret_u64_xx.  */
+   TEST_VREINTERPRET(, uint, u, 64, 1, int, s, 8, 8, expected_u64_1);
+@@ -574,6 +764,7 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET(, uint, u, 64, 1, uint, u, 32, 2, expected_u64_7);
+   TEST_VREINTERPRET(, uint, u, 64, 1, poly, p, 8, 8, expected_u64_8);
+   TEST_VREINTERPRET(, uint, u, 64, 1, poly, p, 16, 4, expected_u64_9);
++  TEST_VREINTERPRET(, uint, u, 64, 1, float, f, 16, 4, expected_u64_10);
+ 
+   /* vreinterpret_p8_xx.  */
+   TEST_VREINTERPRET_POLY(, poly, p, 8, 8, int, s, 8, 8, expected_p8_1);
+@@ -585,6 +776,7 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET_POLY(, poly, p, 8, 8, uint, u, 32, 2, expected_p8_7);
+   TEST_VREINTERPRET_POLY(, poly, p, 8, 8, uint, u, 64, 1, expected_p8_8);
+   TEST_VREINTERPRET_POLY(, poly, p, 8, 8, poly, p, 16, 4, expected_p8_9);
++  TEST_VREINTERPRET_POLY(, poly, p, 8, 8, float, f, 16, 4, expected_p8_10);
+ 
+   /* vreinterpret_p16_xx.  */
+   TEST_VREINTERPRET_POLY(, poly, p, 16, 4, int, s, 8, 8, expected_p16_1);
+@@ -596,6 +788,7 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET_POLY(, poly, p, 16, 4, uint, u, 32, 2, expected_p16_7);
+   TEST_VREINTERPRET_POLY(, poly, p, 16, 4, uint, u, 64, 1, expected_p16_8);
+   TEST_VREINTERPRET_POLY(, poly, p, 16, 4, poly, p, 8, 8, expected_p16_9);
++  TEST_VREINTERPRET_POLY(, poly, p, 16, 4, float, f, 16, 4, expected_p16_10);
+ 
+   /* vreinterpretq_s8_xx.  */
+   TEST_VREINTERPRET(q, int, s, 8, 16, int, s, 16, 8, expected_q_s8_1);
+@@ -607,6 +800,7 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET(q, int, s, 8, 16, uint, u, 64, 2, expected_q_s8_7);
+   TEST_VREINTERPRET(q, int, s, 8, 16, poly, p, 8, 16, expected_q_s8_8);
+   TEST_VREINTERPRET(q, int, s, 8, 16, poly, p, 16, 8, expected_q_s8_9);
++  TEST_VREINTERPRET(q, int, s, 8, 16, float, f, 16, 8, expected_q_s8_10);
+ 
+   /* vreinterpretq_s16_xx.  */
+   TEST_VREINTERPRET(q, int, s, 16, 8, int, s, 8, 16, expected_q_s16_1);
+@@ -618,6 +812,7 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET(q, int, s, 16, 8, uint, u, 64, 2, expected_q_s16_7);
+   TEST_VREINTERPRET(q, int, s, 16, 8, poly, p, 8, 16, expected_q_s16_8);
+   TEST_VREINTERPRET(q, int, s, 16, 8, poly, p, 16, 8, expected_q_s16_9);
++  TEST_VREINTERPRET(q, int, s, 16, 8, float, f, 16, 8, expected_q_s16_10);
+ 
+   /* vreinterpretq_s32_xx.  */
+   TEST_VREINTERPRET(q, int, s, 32, 4, int, s, 8, 16, expected_q_s32_1);
+@@ -629,6 +824,7 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET(q, int, s, 32, 4, uint, u, 64, 2, expected_q_s32_7);
+   TEST_VREINTERPRET(q, int, s, 32, 4, poly, p, 8, 16, expected_q_s32_8);
+   TEST_VREINTERPRET(q, int, s, 32, 4, poly, p, 16, 8, expected_q_s32_9);
++  TEST_VREINTERPRET(q, int, s, 32, 4, float, f, 16, 8, expected_q_s32_10);
+ 
+   /* vreinterpretq_s64_xx.  */
+   TEST_VREINTERPRET(q, int, s, 64, 2, int, s, 8, 16, expected_q_s64_1);
+@@ -640,6 +836,7 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET(q, int, s, 64, 2, uint, u, 64, 2, expected_q_s64_7);
+   TEST_VREINTERPRET(q, int, s, 64, 2, poly, p, 8, 16, expected_q_s64_8);
+   TEST_VREINTERPRET(q, int, s, 64, 2, poly, p, 16, 8, expected_q_s64_9);
++  TEST_VREINTERPRET(q, int, s, 64, 2, float, f, 16, 8, expected_q_s64_10);
+ 
+   /* vreinterpretq_u8_xx.  */
+   TEST_VREINTERPRET(q, uint, u, 8, 16, int, s, 8, 16, expected_q_u8_1);
+@@ -651,6 +848,7 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET(q, uint, u, 8, 16, uint, u, 64, 2, expected_q_u8_7);
+   TEST_VREINTERPRET(q, uint, u, 8, 16, poly, p, 8, 16, expected_q_u8_8);
+   TEST_VREINTERPRET(q, uint, u, 8, 16, poly, p, 16, 8, expected_q_u8_9);
++  TEST_VREINTERPRET(q, uint, u, 8, 16, float, f, 16, 8, expected_q_u8_10);
+ 
+   /* vreinterpretq_u16_xx.  */
+   TEST_VREINTERPRET(q, uint, u, 16, 8, int, s, 8, 16, expected_q_u16_1);
+@@ -662,6 +860,7 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET(q, uint, u, 16, 8, uint, u, 64, 2, expected_q_u16_7);
+   TEST_VREINTERPRET(q, uint, u, 16, 8, poly, p, 8, 16, expected_q_u16_8);
+   TEST_VREINTERPRET(q, uint, u, 16, 8, poly, p, 16, 8, expected_q_u16_9);
++  TEST_VREINTERPRET(q, uint, u, 16, 8, float, f, 16, 8, expected_q_u16_10);
+ 
+   /* vreinterpretq_u32_xx.  */
+   TEST_VREINTERPRET(q, uint, u, 32, 4, int, s, 8, 16, expected_q_u32_1);
+@@ -673,6 +872,7 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET(q, uint, u, 32, 4, uint, u, 64, 2, expected_q_u32_7);
+   TEST_VREINTERPRET(q, uint, u, 32, 4, poly, p, 8, 16, expected_q_u32_8);
+   TEST_VREINTERPRET(q, uint, u, 32, 4, poly, p, 16, 8, expected_q_u32_9);
++  TEST_VREINTERPRET(q, uint, u, 32, 4, float, f, 16, 8, expected_q_u32_10);
+ 
+   /* vreinterpretq_u64_xx.  */
+   TEST_VREINTERPRET(q, uint, u, 64, 2, int, s, 8, 16, expected_q_u64_1);
+@@ -684,6 +884,31 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET(q, uint, u, 64, 2, uint, u, 32, 4, expected_q_u64_7);
+   TEST_VREINTERPRET(q, uint, u, 64, 2, poly, p, 8, 16, expected_q_u64_8);
+   TEST_VREINTERPRET(q, uint, u, 64, 2, poly, p, 16, 8, expected_q_u64_9);
++  TEST_VREINTERPRET(q, uint, u, 64, 2, float, f, 16, 8, expected_q_u64_10);
++
++  /* vreinterpretq_p8_xx.  */
++  TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, int, s, 8, 16, expected_q_p8_1);
++  TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, int, s, 16, 8, expected_q_p8_2);
++  TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, int, s, 32, 4, expected_q_p8_3);
++  TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, int, s, 64, 2, expected_q_p8_4);
++  TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, uint, u, 8, 16, expected_q_p8_5);
++  TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, uint, u, 16, 8, expected_q_p8_6);
++  TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, uint, u, 32, 4, expected_q_p8_7);
++  TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, uint, u, 64, 2, expected_q_p8_8);
++  TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, poly, p, 16, 8, expected_q_p8_9);
++  TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, float, f, 16, 8, expected_q_p8_10);
++
++  /* vreinterpretq_p16_xx.  */
++  TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, int, s, 8, 16, expected_q_p16_1);
++  TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, int, s, 16, 8, expected_q_p16_2);
++  TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, int, s, 32, 4, expected_q_p16_3);
++  TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, int, s, 64, 2, expected_q_p16_4);
++  TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, uint, u, 8, 16, expected_q_p16_5);
++  TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, uint, u, 16, 8, expected_q_p16_6);
++  TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, uint, u, 32, 4, expected_q_p16_7);
++  TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, uint, u, 64, 2, expected_q_p16_8);
++  TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, poly, p, 8, 16, expected_q_p16_9);
++  TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, float, f, 16, 8, expected_q_p16_10);
+ 
+   /* vreinterpret_f32_xx.  */
+   TEST_VREINTERPRET_FP(, float, f, 32, 2, int, s, 8, 8, expected_f32_1);
+@@ -696,6 +921,7 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET_FP(, float, f, 32, 2, uint, u, 64, 1, expected_f32_8);
+   TEST_VREINTERPRET_FP(, float, f, 32, 2, poly, p, 8, 8, expected_f32_9);
+   TEST_VREINTERPRET_FP(, float, f, 32, 2, poly, p, 16, 4, expected_f32_10);
++  TEST_VREINTERPRET_FP(, float, f, 32, 2, float, f, 16, 4, expected_f32_11);
+ 
+   /* vreinterpretq_f32_xx.  */
+   TEST_VREINTERPRET_FP(q, float, f, 32, 4, int, s, 8, 16, expected_q_f32_1);
+@@ -708,6 +934,7 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET_FP(q, float, f, 32, 4, uint, u, 64, 2, expected_q_f32_8);
+   TEST_VREINTERPRET_FP(q, float, f, 32, 4, poly, p, 8, 16, expected_q_f32_9);
+   TEST_VREINTERPRET_FP(q, float, f, 32, 4, poly, p, 16, 8, expected_q_f32_10);
++  TEST_VREINTERPRET_FP(q, float, f, 32, 4, float, f, 16, 8, expected_q_f32_11);
+ 
+   /* vreinterpret_xx_f32.  */
+   TEST_VREINTERPRET(, int, s, 8, 8, float, f, 32, 2, expected_xx_f32_1);
+@@ -720,6 +947,7 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET(, uint, u, 64, 1, float, f, 32, 2, expected_xx_f32_8);
+   TEST_VREINTERPRET_POLY(, poly, p, 8, 8, float, f, 32, 2, expected_xx_f32_9);
+   TEST_VREINTERPRET_POLY(, poly, p, 16, 4, float, f, 32, 2, expected_xx_f32_10);
++  TEST_VREINTERPRET_FP(, float, f, 16, 4, float, f, 32, 2, expected_xx_f32_11);
+ 
+   /* vreinterpretq_xx_f32.  */
+   TEST_VREINTERPRET(q, int, s, 8, 16, float, f, 32, 4, expected_q_xx_f32_1);
+@@ -732,6 +960,31 @@ void exec_vreinterpret (void)
+   TEST_VREINTERPRET(q, uint, u, 64, 2, float, f, 32, 4, expected_q_xx_f32_8);
+   TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, float, f, 32, 4, expected_q_xx_f32_9);
+   TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, float, f, 32, 4, expected_q_xx_f32_10);
++  TEST_VREINTERPRET_FP(q, float, f, 16, 8, float, f, 32, 4, expected_q_xx_f32_11);
++
++  /* vreinterpret_f16_xx.  */
++  TEST_VREINTERPRET_FP(, float, f, 16, 4, int, s, 8, 8, expected_f16_1);
++  TEST_VREINTERPRET_FP(, float, f, 16, 4, int, s, 16, 4, expected_f16_2);
++  TEST_VREINTERPRET_FP(, float, f, 16, 4, int, s, 32, 2, expected_f16_3);
++  TEST_VREINTERPRET_FP(, float, f, 16, 4, int, s, 64, 1, expected_f16_4);
++  TEST_VREINTERPRET_FP(, float, f, 16, 4, uint, u, 8, 8, expected_f16_5);
++  TEST_VREINTERPRET_FP(, float, f, 16, 4, uint, u, 16, 4, expected_f16_6);
++  TEST_VREINTERPRET_FP(, float, f, 16, 4, uint, u, 32, 2, expected_f16_7);
++  TEST_VREINTERPRET_FP(, float, f, 16, 4, uint, u, 64, 1, expected_f16_8);
++  TEST_VREINTERPRET_FP(, float, f, 16, 4, poly, p, 8, 8, expected_f16_9);
++  TEST_VREINTERPRET_FP(, float, f, 16, 4, poly, p, 16, 4, expected_f16_10);
++
++  /* vreinterpretq_f16_xx.  */
++  TEST_VREINTERPRET_FP(q, float, f, 16, 8, int, s, 8, 16, expected_q_f16_1);
++  TEST_VREINTERPRET_FP(q, float, f, 16, 8, int, s, 16, 8, expected_q_f16_2);
++  TEST_VREINTERPRET_FP(q, float, f, 16, 8, int, s, 32, 4, expected_q_f16_3);
++  TEST_VREINTERPRET_FP(q, float, f, 16, 8, int, s, 64, 2, expected_q_f16_4);
++  TEST_VREINTERPRET_FP(q, float, f, 16, 8, uint, u, 8, 16, expected_q_f16_5);
++  TEST_VREINTERPRET_FP(q, float, f, 16, 8, uint, u, 16, 8, expected_q_f16_6);
++  TEST_VREINTERPRET_FP(q, float, f, 16, 8, uint, u, 32, 4, expected_q_f16_7);
++  TEST_VREINTERPRET_FP(q, float, f, 16, 8, uint, u, 64, 2, expected_q_f16_8);
++  TEST_VREINTERPRET_FP(q, float, f, 16, 8, poly, p, 8, 16, expected_q_f16_9);
++  TEST_VREINTERPRET_FP(q, float, f, 16, 8, poly, p, 16, 8, expected_q_f16_10);
+ }
+ 
+ int main (void)
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret_p128.c
+@@ -0,0 +1,160 @@
++/* This file contains tests for the vreinterpret *p128 intrinsics.  */
++
++/* { dg-require-effective-target arm_crypto_ok } */
++/* { dg-add-options arm_crypto } */
++
++#include <arm_neon.h>
++#include "arm-neon-ref.h"
++#include "compute-ref-data.h"
++
++/* Expected results: vreinterpretq_p128_*.  */
++VECT_VAR_DECL(vreint_expected_q_p128_s8,poly,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
++							  0xfffefdfcfbfaf9f8 };
++VECT_VAR_DECL(vreint_expected_q_p128_s16,poly,64,2) [] = { 0xfff3fff2fff1fff0,
++							   0xfff7fff6fff5fff4 };
++VECT_VAR_DECL(vreint_expected_q_p128_s32,poly,64,2) [] = { 0xfffffff1fffffff0,
++							   0xfffffff3fffffff2 };
++VECT_VAR_DECL(vreint_expected_q_p128_s64,poly,64,2) [] = { 0xfffffffffffffff0,
++							   0xfffffffffffffff1 };
++VECT_VAR_DECL(vreint_expected_q_p128_u8,poly,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
++							  0xfffefdfcfbfaf9f8 };
++VECT_VAR_DECL(vreint_expected_q_p128_u16,poly,64,2) [] = { 0xfff3fff2fff1fff0,
++							   0xfff7fff6fff5fff4 };
++VECT_VAR_DECL(vreint_expected_q_p128_u32,poly,64,2) [] = { 0xfffffff1fffffff0,
++							   0xfffffff3fffffff2 };
++VECT_VAR_DECL(vreint_expected_q_p128_u64,poly,64,2) [] = { 0xfffffffffffffff0,
++							   0xfffffffffffffff1 };
++VECT_VAR_DECL(vreint_expected_q_p128_p8,poly,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
++							  0xfffefdfcfbfaf9f8 };
++VECT_VAR_DECL(vreint_expected_q_p128_p16,poly,64,2) [] = { 0xfff3fff2fff1fff0,
++							   0xfff7fff6fff5fff4 };
++VECT_VAR_DECL(vreint_expected_q_p128_f32,poly,64,2) [] = { 0xc1700000c1800000,
++							   0xc1500000c1600000 };
++VECT_VAR_DECL(vreint_expected_q_p128_f16,poly,64,2) [] = { 0xca80cb00cb80cc00,
++							   0xc880c900c980ca00 };
++
++/* Expected results: vreinterpretq_*_p128.  */
++VECT_VAR_DECL(vreint_expected_q_s8_p128,int,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
++							 0xff, 0xff, 0xff, 0xff,
++							 0xf1, 0xff, 0xff, 0xff,
++							 0xff, 0xff, 0xff, 0xff };
++VECT_VAR_DECL(vreint_expected_q_s16_p128,int,16,8) [] = { 0xfff0, 0xffff,
++							  0xffff, 0xffff,
++							  0xfff1, 0xffff,
++							  0xffff, 0xffff };
++VECT_VAR_DECL(vreint_expected_q_s32_p128,int,32,4) [] = { 0xfffffff0, 0xffffffff,
++							  0xfffffff1, 0xffffffff };
++VECT_VAR_DECL(vreint_expected_q_s64_p128,int,64,2) [] = { 0xfffffffffffffff0,
++							  0xfffffffffffffff1 };
++VECT_VAR_DECL(vreint_expected_q_u8_p128,uint,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
++							  0xff, 0xff, 0xff, 0xff,
++							  0xf1, 0xff, 0xff, 0xff,
++							  0xff, 0xff, 0xff, 0xff };
++VECT_VAR_DECL(vreint_expected_q_u16_p128,uint,16,8) [] = { 0xfff0, 0xffff,
++							   0xffff, 0xffff,
++							   0xfff1, 0xffff,
++							   0xffff, 0xffff };
++VECT_VAR_DECL(vreint_expected_q_u32_p128,uint,32,4) [] = { 0xfffffff0, 0xffffffff,
++							   0xfffffff1, 0xffffffff };
++VECT_VAR_DECL(vreint_expected_q_u64_p128,uint,64,2) [] = { 0xfffffffffffffff0,
++							   0xfffffffffffffff1 };
++VECT_VAR_DECL(vreint_expected_q_p8_p128,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
++							  0xff, 0xff, 0xff, 0xff,
++							  0xf1, 0xff, 0xff, 0xff,
++							  0xff, 0xff, 0xff, 0xff };
++VECT_VAR_DECL(vreint_expected_q_p16_p128,poly,16,8) [] = { 0xfff0, 0xffff,
++							   0xffff, 0xffff,
++							   0xfff1, 0xffff,
++							   0xffff, 0xffff };
++VECT_VAR_DECL(vreint_expected_q_p64_p128,uint,64,2) [] = { 0xfffffffffffffff0,
++							   0xfffffffffffffff1 };
++VECT_VAR_DECL(vreint_expected_q_f32_p128,hfloat,32,4) [] = { 0xfffffff0, 0xffffffff,
++							     0xfffffff1, 0xffffffff };
++VECT_VAR_DECL(vreint_expected_q_f16_p128,hfloat,16,8) [] = { 0xfff0, 0xffff,
++							     0xffff, 0xffff,
++							     0xfff1, 0xffff,
++							     0xffff, 0xffff };
++
++int main (void)
++{
++  DECL_VARIABLE_128BITS_VARIANTS(vreint_vector);
++  DECL_VARIABLE(vreint_vector, poly, 64, 2);
++  DECL_VARIABLE_128BITS_VARIANTS(vreint_vector_res);
++  DECL_VARIABLE(vreint_vector_res, poly, 64, 2);
++
++  clean_results ();
++
++  TEST_MACRO_128BITS_VARIANTS_2_5(VLOAD, vreint_vector, buffer);
++  VLOAD(vreint_vector, buffer, q, poly, p, 64, 2);
++  VLOAD(vreint_vector, buffer, q, float, f, 16, 8);
++  VLOAD(vreint_vector, buffer, q, float, f, 32, 4);
++
++  /* vreinterpretq_p128_* tests.  */
++#undef TEST_MSG
++#define TEST_MSG "VREINTERPRETQ_P128_*"
++
++  /* Since there is no way to store a poly128_t value, convert to
++     poly64x2_t before storing. This means that we are not able to
++     test vreinterpretq_p128* alone, and that errors in
++     vreinterpretq_p64_p128 could compensate for errors in
++     vreinterpretq_p128*.  */
++#define TEST_VREINTERPRET128(Q, T1, T2, W, N, TS1, TS2, WS, NS, EXPECTED) \
++  VECT_VAR(vreint_vector_res, poly, 64, 2) =  vreinterpretq_p64_p128(	\
++    vreinterpret##Q##_##T2##W##_##TS2##WS(VECT_VAR(vreint_vector, TS1, WS, NS))); \
++  vst1##Q##_##T2##64(VECT_VAR(result, poly, 64, 2),			\
++                     VECT_VAR(vreint_vector_res, poly, 64, 2));		\
++  CHECK(TEST_MSG, T1, 64, 2, PRIx##64, EXPECTED, "");
++
++  TEST_VREINTERPRET128(q, poly, p, 128, 1, int, s, 8, 16, vreint_expected_q_p128_s8);
++  TEST_VREINTERPRET128(q, poly, p, 128, 1, int, s, 16, 8, vreint_expected_q_p128_s16);
++  TEST_VREINTERPRET128(q, poly, p, 128, 1, int, s, 32, 4, vreint_expected_q_p128_s32);
++  TEST_VREINTERPRET128(q, poly, p, 128, 1, int, s, 64, 2, vreint_expected_q_p128_s64);
++  TEST_VREINTERPRET128(q, poly, p, 128, 1, uint, u, 8, 16, vreint_expected_q_p128_u8);
++  TEST_VREINTERPRET128(q, poly, p, 128, 1, uint, u, 16, 8, vreint_expected_q_p128_u16);
++  TEST_VREINTERPRET128(q, poly, p, 128, 1, uint, u, 32, 4, vreint_expected_q_p128_u32);
++  TEST_VREINTERPRET128(q, poly, p, 128, 1, uint, u, 64, 2, vreint_expected_q_p128_u64);
++  TEST_VREINTERPRET128(q, poly, p, 128, 1, poly, p, 8, 16, vreint_expected_q_p128_p8);
++  TEST_VREINTERPRET128(q, poly, p, 128, 1, poly, p, 16, 8, vreint_expected_q_p128_p16);
++  TEST_VREINTERPRET128(q, poly, p, 128, 1, float, f, 16, 8, vreint_expected_q_p128_f16);
++  TEST_VREINTERPRET128(q, poly, p, 128, 1, float, f, 32, 4, vreint_expected_q_p128_f32);
++
++  /* vreinterpretq_*_p128 tests.  */
++#undef TEST_MSG
++#define TEST_MSG "VREINTERPRETQ_*_P128"
++
++  /* Since there is no way to load a poly128_t value, load a
++     poly64x2_t and convert it to poly128_t. This means that we are
++     not able to test vreinterpretq_*_p128 alone, and that errors in
++     vreinterpretq_p128_p64 could compensate for errors in
++     vreinterpretq_*_p128*.  */
++#define TEST_VREINTERPRET_FROM_P128(Q, T1, T2, W, N, TS1, TS2, WS, NS, EXPECTED) \
++  VECT_VAR(vreint_vector_res, T1, W, N) =				\
++    vreinterpret##Q##_##T2##W##_##TS2##WS(				\
++  vreinterpretq_p128_p64(VECT_VAR(vreint_vector, TS1, 64, 2)));		\
++  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N),				\
++		    VECT_VAR(vreint_vector_res, T1, W, N));		\
++  CHECK(TEST_MSG, T1, W, N, PRIx##W, EXPECTED, "");
++
++#define TEST_VREINTERPRET_FP_FROM_P128(Q, T1, T2, W, N, TS1, TS2, WS, NS, EXPECTED) \
++  VECT_VAR(vreint_vector_res, T1, W, N) =				\
++    vreinterpret##Q##_##T2##W##_##TS2##WS(				\
++  vreinterpretq_p128_p64(VECT_VAR(vreint_vector, TS1, 64, 2)));		\
++  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N),				\
++		    VECT_VAR(vreint_vector_res, T1, W, N));		\
++  CHECK_FP(TEST_MSG, T1, W, N, PRIx##W, EXPECTED, "");
++
++  TEST_VREINTERPRET_FROM_P128(q, int, s, 8, 16, poly, p, 128, 1, vreint_expected_q_s8_p128);
++  TEST_VREINTERPRET_FROM_P128(q, int, s, 16, 8, poly, p, 128, 1, vreint_expected_q_s16_p128);
++  TEST_VREINTERPRET_FROM_P128(q, int, s, 32, 4, poly, p, 128, 1, vreint_expected_q_s32_p128);
++  TEST_VREINTERPRET_FROM_P128(q, int, s, 64, 2, poly, p, 128, 1, vreint_expected_q_s64_p128);
++  TEST_VREINTERPRET_FROM_P128(q, uint, u, 8, 16, poly, p, 128, 1, vreint_expected_q_u8_p128);
++  TEST_VREINTERPRET_FROM_P128(q, uint, u, 16, 8, poly, p, 128, 1, vreint_expected_q_u16_p128);
++  TEST_VREINTERPRET_FROM_P128(q, uint, u, 32, 4, poly, p, 128, 1, vreint_expected_q_u32_p128);
++  TEST_VREINTERPRET_FROM_P128(q, uint, u, 64, 2, poly, p, 128, 1, vreint_expected_q_u64_p128);
++  TEST_VREINTERPRET_FROM_P128(q, poly, p, 8, 16, poly, p, 128, 1, vreint_expected_q_p8_p128);
++  TEST_VREINTERPRET_FROM_P128(q, poly, p, 16, 8, poly, p, 128, 1, vreint_expected_q_p16_p128);
++  TEST_VREINTERPRET_FP_FROM_P128(q, float, f, 16, 8, poly, p, 128, 1, vreint_expected_q_f16_p128);
++  TEST_VREINTERPRET_FP_FROM_P128(q, float, f, 32, 4, poly, p, 128, 1, vreint_expected_q_f32_p128);
++
++  return 0;
++}
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret_p64.c
+@@ -0,0 +1,202 @@
++/* This file contains tests for the vreinterpret *p64 intrinsics.  */
++
++/* { dg-require-effective-target arm_crypto_ok } */
++/* { dg-add-options arm_crypto } */
++
++#include <arm_neon.h>
++#include "arm-neon-ref.h"
++#include "compute-ref-data.h"
++
++/* Expected results: vreinterpret_p64_*.  */
++VECT_VAR_DECL(vreint_expected_p64_s8,poly,64,1) [] = { 0xf7f6f5f4f3f2f1f0 };
++VECT_VAR_DECL(vreint_expected_p64_s16,poly,64,1) [] = { 0xfff3fff2fff1fff0 };
++VECT_VAR_DECL(vreint_expected_p64_s32,poly,64,1) [] = { 0xfffffff1fffffff0 };
++VECT_VAR_DECL(vreint_expected_p64_s64,poly,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(vreint_expected_p64_u8,poly,64,1) [] = { 0xf7f6f5f4f3f2f1f0 };
++VECT_VAR_DECL(vreint_expected_p64_u16,poly,64,1) [] = { 0xfff3fff2fff1fff0 };
++VECT_VAR_DECL(vreint_expected_p64_u32,poly,64,1) [] = { 0xfffffff1fffffff0 };
++VECT_VAR_DECL(vreint_expected_p64_u64,poly,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(vreint_expected_p64_p8,poly,64,1) [] = { 0xf7f6f5f4f3f2f1f0 };
++VECT_VAR_DECL(vreint_expected_p64_p16,poly,64,1) [] = { 0xfff3fff2fff1fff0 };
++VECT_VAR_DECL(vreint_expected_p64_f32,poly,64,1) [] = { 0xc1700000c1800000 };
++VECT_VAR_DECL(vreint_expected_p64_f16,poly,64,1) [] = { 0xca80cb00cb80cc00 };
++
++/* Expected results: vreinterpretq_p64_*.  */
++VECT_VAR_DECL(vreint_expected_q_p64_s8,poly,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
++							 0xfffefdfcfbfaf9f8 };
++VECT_VAR_DECL(vreint_expected_q_p64_s16,poly,64,2) [] = { 0xfff3fff2fff1fff0,
++							  0xfff7fff6fff5fff4 };
++VECT_VAR_DECL(vreint_expected_q_p64_s32,poly,64,2) [] = { 0xfffffff1fffffff0,
++							  0xfffffff3fffffff2 };
++VECT_VAR_DECL(vreint_expected_q_p64_s64,poly,64,2) [] = { 0xfffffffffffffff0,
++							  0xfffffffffffffff1 };
++VECT_VAR_DECL(vreint_expected_q_p64_u8,poly,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
++							 0xfffefdfcfbfaf9f8 };
++VECT_VAR_DECL(vreint_expected_q_p64_u16,poly,64,2) [] = { 0xfff3fff2fff1fff0,
++							  0xfff7fff6fff5fff4 };
++VECT_VAR_DECL(vreint_expected_q_p64_u32,poly,64,2) [] = { 0xfffffff1fffffff0,
++							  0xfffffff3fffffff2 };
++VECT_VAR_DECL(vreint_expected_q_p64_u64,poly,64,2) [] = { 0xfffffffffffffff0,
++							  0xfffffffffffffff1 };
++VECT_VAR_DECL(vreint_expected_q_p64_p8,poly,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
++							 0xfffefdfcfbfaf9f8 };
++VECT_VAR_DECL(vreint_expected_q_p64_p16,poly,64,2) [] = { 0xfff3fff2fff1fff0,
++							  0xfff7fff6fff5fff4 };
++VECT_VAR_DECL(vreint_expected_q_p64_f32,poly,64,2) [] = { 0xc1700000c1800000,
++							  0xc1500000c1600000 };
++VECT_VAR_DECL(vreint_expected_q_p64_f16,poly,64,2) [] = { 0xca80cb00cb80cc00,
++							  0xc880c900c980ca00 };
++
++/* Expected results: vreinterpret_*_p64.  */
++VECT_VAR_DECL(vreint_expected_s8_p64,int,8,8) [] = { 0xf0, 0xff, 0xff, 0xff,
++						     0xff, 0xff, 0xff, 0xff };
++VECT_VAR_DECL(vreint_expected_s16_p64,int,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
++VECT_VAR_DECL(vreint_expected_s32_p64,int,32,2) [] = { 0xfffffff0, 0xffffffff };
++VECT_VAR_DECL(vreint_expected_s64_p64,int,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(vreint_expected_u8_p64,uint,8,8) [] = { 0xf0, 0xff, 0xff, 0xff,
++						      0xff, 0xff, 0xff, 0xff };
++VECT_VAR_DECL(vreint_expected_u16_p64,uint,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
++VECT_VAR_DECL(vreint_expected_u32_p64,uint,32,2) [] = { 0xfffffff0, 0xffffffff };
++VECT_VAR_DECL(vreint_expected_u64_p64,uint,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(vreint_expected_p8_p64,poly,8,8) [] = { 0xf0, 0xff, 0xff, 0xff,
++						      0xff, 0xff, 0xff, 0xff };
++VECT_VAR_DECL(vreint_expected_p16_p64,poly,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
++VECT_VAR_DECL(vreint_expected_f32_p64,hfloat,32,2) [] = { 0xfffffff0, 0xffffffff };
++VECT_VAR_DECL(vreint_expected_f16_p64,hfloat,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
++
++/* Expected results: vreinterpretq_*_p64.  */
++VECT_VAR_DECL(vreint_expected_q_s8_p64,int,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
++							0xff, 0xff, 0xff, 0xff,
++							0xf1, 0xff, 0xff, 0xff,
++							0xff, 0xff, 0xff, 0xff };
++VECT_VAR_DECL(vreint_expected_q_s16_p64,int,16,8) [] = { 0xfff0, 0xffff,
++							 0xffff, 0xffff,
++							 0xfff1, 0xffff,
++							 0xffff, 0xffff };
++VECT_VAR_DECL(vreint_expected_q_s32_p64,int,32,4) [] = { 0xfffffff0, 0xffffffff,
++							 0xfffffff1, 0xffffffff };
++VECT_VAR_DECL(vreint_expected_q_s64_p64,int,64,2) [] = { 0xfffffffffffffff0,
++							 0xfffffffffffffff1 };
++VECT_VAR_DECL(vreint_expected_q_u8_p64,uint,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
++							 0xff, 0xff, 0xff, 0xff,
++							 0xf1, 0xff, 0xff, 0xff,
++							 0xff, 0xff, 0xff, 0xff };
++VECT_VAR_DECL(vreint_expected_q_u16_p64,uint,16,8) [] = { 0xfff0, 0xffff,
++							  0xffff, 0xffff,
++							  0xfff1, 0xffff,
++							  0xffff, 0xffff };
++VECT_VAR_DECL(vreint_expected_q_u32_p64,uint,32,4) [] = { 0xfffffff0, 0xffffffff,
++							  0xfffffff1, 0xffffffff };
++VECT_VAR_DECL(vreint_expected_q_u64_p64,uint,64,2) [] = { 0xfffffffffffffff0,
++							  0xfffffffffffffff1 };
++VECT_VAR_DECL(vreint_expected_q_p8_p64,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
++							 0xff, 0xff, 0xff, 0xff,
++							 0xf1, 0xff, 0xff, 0xff,
++							 0xff, 0xff, 0xff, 0xff };
++VECT_VAR_DECL(vreint_expected_q_p16_p64,poly,16,8) [] = { 0xfff0, 0xffff,
++							  0xffff, 0xffff,
++							  0xfff1, 0xffff,
++							  0xffff, 0xffff };
++VECT_VAR_DECL(vreint_expected_q_f32_p64,hfloat,32,4) [] = { 0xfffffff0, 0xffffffff,
++							    0xfffffff1, 0xffffffff };
++VECT_VAR_DECL(vreint_expected_q_f16_p64,hfloat,16,8) [] = { 0xfff0, 0xffff,
++							    0xffff, 0xffff,
++							    0xfff1, 0xffff,
++							    0xffff, 0xffff };
++
++int main (void)
++{
++#define TEST_VREINTERPRET(Q, T1, T2, W, N, TS1, TS2, WS, NS, EXPECTED)	\
++  VECT_VAR(vreint_vector_res, T1, W, N) =				\
++    vreinterpret##Q##_##T2##W##_##TS2##WS(VECT_VAR(vreint_vector, TS1, WS, NS)); \
++  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N),				\
++		    VECT_VAR(vreint_vector_res, T1, W, N));		\
++  CHECK(TEST_MSG, T1, W, N, PRIx##W, EXPECTED, "");
++
++#define TEST_VREINTERPRET_FP(Q, T1, T2, W, N, TS1, TS2, WS, NS, EXPECTED) \
++  VECT_VAR(vreint_vector_res, T1, W, N) =				\
++    vreinterpret##Q##_##T2##W##_##TS2##WS(VECT_VAR(vreint_vector, TS1, WS, NS)); \
++  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N),				\
++		    VECT_VAR(vreint_vector_res, T1, W, N));		\
++  CHECK_FP(TEST_MSG, T1, W, N, PRIx##W, EXPECTED, "");
++
++  DECL_VARIABLE_ALL_VARIANTS(vreint_vector);
++  DECL_VARIABLE(vreint_vector, poly, 64, 1);
++  DECL_VARIABLE(vreint_vector, poly, 64, 2);
++  DECL_VARIABLE_ALL_VARIANTS(vreint_vector_res);
++  DECL_VARIABLE(vreint_vector_res, poly, 64, 1);
++  DECL_VARIABLE(vreint_vector_res, poly, 64, 2);
++
++  clean_results ();
++
++  TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vreint_vector, buffer);
++  VLOAD(vreint_vector, buffer, , poly, p, 64, 1);
++  VLOAD(vreint_vector, buffer, q, poly, p, 64, 2);
++  VLOAD(vreint_vector, buffer, , float, f, 16, 4);
++  VLOAD(vreint_vector, buffer, q, float, f, 16, 8);
++  VLOAD(vreint_vector, buffer, , float, f, 32, 2);
++  VLOAD(vreint_vector, buffer, q, float, f, 32, 4);
++
++  /* vreinterpret_p64_* tests.  */
++#undef TEST_MSG
++#define TEST_MSG "VREINTERPRET_P64_*"
++  TEST_VREINTERPRET(, poly, p, 64, 1, int, s, 8, 8, vreint_expected_p64_s8);
++  TEST_VREINTERPRET(, poly, p, 64, 1, int, s, 16, 4, vreint_expected_p64_s16);
++  TEST_VREINTERPRET(, poly, p, 64, 1, int, s, 32, 2, vreint_expected_p64_s32);
++  TEST_VREINTERPRET(, poly, p, 64, 1, int, s, 64, 1, vreint_expected_p64_s64);
++  TEST_VREINTERPRET(, poly, p, 64, 1, uint, u, 8, 8, vreint_expected_p64_u8);
++  TEST_VREINTERPRET(, poly, p, 64, 1, uint, u, 16, 4, vreint_expected_p64_u16);
++  TEST_VREINTERPRET(, poly, p, 64, 1, uint, u, 32, 2, vreint_expected_p64_u32);
++  TEST_VREINTERPRET(, poly, p, 64, 1, uint, u, 64, 1, vreint_expected_p64_u64);
++  TEST_VREINTERPRET(, poly, p, 64, 1, poly, p, 8, 8, vreint_expected_p64_p8);
++  TEST_VREINTERPRET(, poly, p, 64, 1, poly, p, 16, 4, vreint_expected_p64_p16);
++  TEST_VREINTERPRET(, poly, p, 64, 1, float, f, 16, 4, vreint_expected_p64_f16);
++  TEST_VREINTERPRET(, poly, p, 64, 1, float, f, 32, 2, vreint_expected_p64_f32);
++
++  /* vreinterpretq_p64_* tests.  */
++#undef TEST_MSG
++#define TEST_MSG "VREINTERPRETQ_P64_*"
++  TEST_VREINTERPRET(q, poly, p, 64, 2, int, s, 8, 16, vreint_expected_q_p64_s8);
++  TEST_VREINTERPRET(q, poly, p, 64, 2, int, s, 16, 8, vreint_expected_q_p64_s16);
++  TEST_VREINTERPRET(q, poly, p, 64, 2, int, s, 32, 4, vreint_expected_q_p64_s32);
++  TEST_VREINTERPRET(q, poly, p, 64, 2, int, s, 64, 2, vreint_expected_q_p64_s64);
++  TEST_VREINTERPRET(q, poly, p, 64, 2, uint, u, 8, 16, vreint_expected_q_p64_u8);
++  TEST_VREINTERPRET(q, poly, p, 64, 2, uint, u, 16, 8, vreint_expected_q_p64_u16);
++  TEST_VREINTERPRET(q, poly, p, 64, 2, uint, u, 32, 4, vreint_expected_q_p64_u32);
++  TEST_VREINTERPRET(q, poly, p, 64, 2, uint, u, 64, 2, vreint_expected_q_p64_u64);
++  TEST_VREINTERPRET(q, poly, p, 64, 2, poly, p, 8, 16, vreint_expected_q_p64_p8);
++  TEST_VREINTERPRET(q, poly, p, 64, 2, poly, p, 16, 8, vreint_expected_q_p64_p16);
++  TEST_VREINTERPRET(q, poly, p, 64, 2, float, f, 16, 8, vreint_expected_q_p64_f16);
++  TEST_VREINTERPRET(q, poly, p, 64, 2, float, f, 32, 4, vreint_expected_q_p64_f32);
++
++  /* vreinterpret_*_p64 tests.  */
++#undef TEST_MSG
++#define TEST_MSG "VREINTERPRET_*_P64"
++
++  TEST_VREINTERPRET(, int, s, 8, 8, poly, p, 64, 1, vreint_expected_s8_p64);
++  TEST_VREINTERPRET(, int, s, 16, 4, poly, p, 64, 1, vreint_expected_s16_p64);
++  TEST_VREINTERPRET(, int, s, 32, 2, poly, p, 64, 1, vreint_expected_s32_p64);
++  TEST_VREINTERPRET(, int, s, 64, 1, poly, p, 64, 1, vreint_expected_s64_p64);
++  TEST_VREINTERPRET(, uint, u, 8, 8, poly, p, 64, 1, vreint_expected_u8_p64);
++  TEST_VREINTERPRET(, uint, u, 16, 4, poly, p, 64, 1, vreint_expected_u16_p64);
++  TEST_VREINTERPRET(, uint, u, 32, 2, poly, p, 64, 1, vreint_expected_u32_p64);
++  TEST_VREINTERPRET(, uint, u, 64, 1, poly, p, 64, 1, vreint_expected_u64_p64);
++  TEST_VREINTERPRET(, poly, p, 8, 8, poly, p, 64, 1, vreint_expected_p8_p64);
++  TEST_VREINTERPRET(, poly, p, 16, 4, poly, p, 64, 1, vreint_expected_p16_p64);
++  TEST_VREINTERPRET_FP(, float, f, 16, 4, poly, p, 64, 1, vreint_expected_f16_p64);
++  TEST_VREINTERPRET_FP(, float, f, 32, 2, poly, p, 64, 1, vreint_expected_f32_p64);
++  TEST_VREINTERPRET(q, int, s, 8, 16, poly, p, 64, 2, vreint_expected_q_s8_p64);
++  TEST_VREINTERPRET(q, int, s, 16, 8, poly, p, 64, 2, vreint_expected_q_s16_p64);
++  TEST_VREINTERPRET(q, int, s, 32, 4, poly, p, 64, 2, vreint_expected_q_s32_p64);
++  TEST_VREINTERPRET(q, int, s, 64, 2, poly, p, 64, 2, vreint_expected_q_s64_p64);
++  TEST_VREINTERPRET(q, uint, u, 8, 16, poly, p, 64, 2, vreint_expected_q_u8_p64);
++  TEST_VREINTERPRET(q, uint, u, 16, 8, poly, p, 64, 2, vreint_expected_q_u16_p64);
++  TEST_VREINTERPRET(q, uint, u, 32, 4, poly, p, 64, 2, vreint_expected_q_u32_p64);
++  TEST_VREINTERPRET(q, uint, u, 64, 2, poly, p, 64, 2, vreint_expected_q_u64_p64);
++  TEST_VREINTERPRET(q, poly, p, 8, 16, poly, p, 64, 2, vreint_expected_q_p8_p64);
++  TEST_VREINTERPRET(q, poly, p, 16, 8, poly, p, 64, 2, vreint_expected_q_p16_p64);
++  TEST_VREINTERPRET_FP(q, float, f, 16, 8, poly, p, 64, 2, vreint_expected_q_f16_p64);
++  TEST_VREINTERPRET_FP(q, float, f, 32, 4, poly, p, 64, 2, vreint_expected_q_f32_p64);
++
++  return 0;
++}
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrnd.c
+@@ -0,0 +1,16 @@
++/* { dg-require-effective-target arm_v8_neon_ok } */
++/* { dg-add-options arm_v8_neon } */
++
++#include <arm_neon.h>
++#include "arm-neon-ref.h"
++#include "compute-ref-data.h"
++
++/* Expected results.  */
++VECT_VAR_DECL (expected, hfloat, 32, 2) [] = { 0xc1800000, 0xc1700000 };
++VECT_VAR_DECL (expected, hfloat, 32, 4) [] = { 0xc1800000, 0xc1700000,
++					       0xc1600000, 0xc1500000 };
++
++#define INSN vrnd
++#define TEST_MSG "VRND"
++
++#include "vrndX.inc"
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndX.inc
+@@ -0,0 +1,43 @@
++#define FNNAME1(NAME) exec_ ## NAME
++#define FNNAME(NAME) FNNAME1 (NAME)
++
++void FNNAME (INSN) (void)
++{
++  /* vector_res = vrndX (vector), then store the result.  */
++#define TEST_VRND2(INSN, Q, T1, T2, W, N)				\
++  VECT_VAR (vector_res, T1, W, N) =					\
++    INSN##Q##_##T2##W (VECT_VAR (vector, T1, W, N));			\
++    vst1##Q##_##T2##W (VECT_VAR (result, T1, W, N),			\
++		       VECT_VAR (vector_res, T1, W, N))
++
++  /* Two auxiliary macros are necessary to expand INSN.  */
++#define TEST_VRND1(INSN, Q, T1, T2, W, N)	\
++  TEST_VRND2 (INSN, Q, T1, T2, W, N)
++
++#define TEST_VRND(Q, T1, T2, W, N)		\
++  TEST_VRND1 (INSN, Q, T1, T2, W, N)
++
++  DECL_VARIABLE (vector, float, 32, 2);
++  DECL_VARIABLE (vector, float, 32, 4);
++
++  DECL_VARIABLE (vector_res, float, 32, 2);
++  DECL_VARIABLE (vector_res, float, 32, 4);
++
++  clean_results ();
++
++  VLOAD (vector, buffer, , float, f, 32, 2);
++  VLOAD (vector, buffer, q, float, f, 32, 4);
++
++  TEST_VRND ( , float, f, 32, 2);
++  TEST_VRND (q, float, f, 32, 4);
++
++  CHECK_FP (TEST_MSG, float, 32, 2, PRIx32, expected, "");
++  CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expected, "");
++}
++
++int
++main (void)
++{
++  FNNAME (INSN) ();
++  return 0;
++}
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrnda.c
+@@ -0,0 +1,16 @@
++/* { dg-require-effective-target arm_v8_neon_ok } */
++/* { dg-add-options arm_v8_neon } */
++
++#include <arm_neon.h>
++#include "arm-neon-ref.h"
++#include "compute-ref-data.h"
++
++/* Expected results.  */
++VECT_VAR_DECL (expected, hfloat, 32, 2) [] = { 0xc1800000, 0xc1700000 };
++VECT_VAR_DECL (expected, hfloat, 32, 4) [] = { 0xc1800000, 0xc1700000,
++					       0xc1600000, 0xc1500000 };
++
++#define INSN vrnda
++#define TEST_MSG "VRNDA"
++
++#include "vrndX.inc"
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndm.c
+@@ -0,0 +1,16 @@
++/* { dg-require-effective-target arm_v8_neon_ok } */
++/* { dg-add-options arm_v8_neon } */
++
++#include <arm_neon.h>
++#include "arm-neon-ref.h"
++#include "compute-ref-data.h"
++
++/* Expected results.  */
++VECT_VAR_DECL (expected, hfloat, 32, 2) [] = { 0xc1800000, 0xc1700000 };
++VECT_VAR_DECL (expected, hfloat, 32, 4) [] = { 0xc1800000, 0xc1700000,
++					       0xc1600000, 0xc1500000 };
++
++#define INSN vrndm
++#define TEST_MSG "VRNDM"
++
++#include "vrndX.inc"
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndn.c
+@@ -0,0 +1,16 @@
++/* { dg-require-effective-target arm_v8_neon_ok } */
++/* { dg-add-options arm_v8_neon } */
++
++#include <arm_neon.h>
++#include "arm-neon-ref.h"
++#include "compute-ref-data.h"
++
++/* Expected results.  */
++VECT_VAR_DECL (expected, hfloat, 32, 2) [] = { 0xc1800000, 0xc1700000 };
++VECT_VAR_DECL (expected, hfloat, 32, 4) [] = { 0xc1800000, 0xc1700000,
++					       0xc1600000, 0xc1500000 };
++
++#define INSN vrndn
++#define TEST_MSG "VRNDN"
++
++#include "vrndX.inc"
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndp.c
+@@ -0,0 +1,16 @@
++/* { dg-require-effective-target arm_v8_neon_ok } */
++/* { dg-add-options arm_v8_neon } */
++
++#include <arm_neon.h>
++#include "arm-neon-ref.h"
++#include "compute-ref-data.h"
++
++/* Expected results.  */
++VECT_VAR_DECL (expected, hfloat, 32, 2) [] = { 0xc1800000, 0xc1700000 };
++VECT_VAR_DECL (expected, hfloat, 32, 4) [] = { 0xc1800000, 0xc1700000,
++					       0xc1600000, 0xc1500000 };
++
++#define INSN vrndp
++#define TEST_MSG "VRNDP"
++
++#include "vrndX.inc"
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndx.c
+@@ -0,0 +1,16 @@
++/* { dg-require-effective-target arm_v8_neon_ok } */
++/* { dg-add-options arm_v8_neon } */
++
++#include <arm_neon.h>
++#include "arm-neon-ref.h"
++#include "compute-ref-data.h"
++
++/* Expected results.  */
++VECT_VAR_DECL (expected, hfloat, 32, 2) [] = { 0xc1800000, 0xc1700000 };
++VECT_VAR_DECL (expected, hfloat, 32, 4) [] = { 0xc1800000, 0xc1700000,
++					       0xc1600000, 0xc1500000 };
++
++#define INSN vrndx
++#define TEST_MSG "VRNDX"
++
++#include "vrndX.inc"
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl.c
+@@ -101,10 +101,8 @@ VECT_VAR_DECL(expected_negative_shift,uint,64,2) [] = { 0x7ffffffffffffff,
+ 							0x7ffffffffffffff };
+ 
+ 
+-#ifndef INSN_NAME
+ #define INSN_NAME vshl
+ #define TEST_MSG "VSHL/VSHLQ"
+-#endif
+ 
+ #define FNNAME1(NAME) exec_ ## NAME
+ #define FNNAME(NAME) FNNAME1(NAME)
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsli_n.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsli_n.c
+@@ -166,9 +166,11 @@ void vsli_extra(void)
+   CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_max_shift, COMMENT);
+   CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_max_shift, COMMENT);
+   CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_max_shift, COMMENT);
++  CHECK(TEST_MSG, int, 64, 2, PRIx64, expected_max_shift, COMMENT);
+   CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_max_shift, COMMENT);
+   CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_max_shift, COMMENT);
+   CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_max_shift, COMMENT);
++  CHECK(TEST_MSG, uint, 64, 2, PRIx64, expected_max_shift, COMMENT);
+   CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_max_shift, COMMENT);
+   CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_max_shift, COMMENT);
+ }
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c
+@@ -14,6 +14,7 @@ VECT_VAR_DECL(expected_st2_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
+ VECT_VAR_DECL(expected_st2_0,poly,8,8) [] = { 0xf0, 0xf1, 0x0, 0x0,
+ 					      0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st2_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0x0, 0x0 };
++VECT_VAR_DECL(expected_st2_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st2_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
+ VECT_VAR_DECL(expected_st2_0,int,16,8) [] = { 0xfff0, 0xfff1, 0x0, 0x0,
+ 					      0x0, 0x0, 0x0, 0x0 };
+@@ -24,6 +25,8 @@ VECT_VAR_DECL(expected_st2_0,uint,32,4) [] = { 0xfffffff0, 0xfffffff1,
+ 					       0x0, 0x0 };
+ VECT_VAR_DECL(expected_st2_0,poly,16,8) [] = { 0xfff0, 0xfff1, 0x0, 0x0,
+ 					       0x0, 0x0, 0x0, 0x0 };
++VECT_VAR_DECL(expected_st2_0,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0x0, 0x0,
++						 0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st2_0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
+ 						 0x0, 0x0 };
+ 
+@@ -39,6 +42,7 @@ VECT_VAR_DECL(expected_st2_1,uint,32,2) [] = { 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st2_1,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 					      0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st2_1,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
++VECT_VAR_DECL(expected_st2_1,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st2_1,hfloat,32,2) [] = { 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st2_1,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 					      0x0, 0x0, 0x0, 0x0 };
+@@ -48,6 +52,8 @@ VECT_VAR_DECL(expected_st2_1,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ VECT_VAR_DECL(expected_st2_1,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st2_1,poly,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 					       0x0, 0x0, 0x0, 0x0 };
++VECT_VAR_DECL(expected_st2_1,hfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
++						 0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st2_1,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+ 
+ /* Expected results for vst3, chunk 0.  */
+@@ -62,6 +68,7 @@ VECT_VAR_DECL(expected_st3_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
+ VECT_VAR_DECL(expected_st3_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0x0,
+ 					      0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st3_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0x0 };
++VECT_VAR_DECL(expected_st3_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0x0 };
+ VECT_VAR_DECL(expected_st3_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
+ VECT_VAR_DECL(expected_st3_0,int,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0x0,
+ 					      0x0, 0x0, 0x0, 0x0 };
+@@ -73,6 +80,8 @@ VECT_VAR_DECL(expected_st3_0,uint,32,4) [] = { 0xfffffff0, 0xfffffff1,
+ 					       0xfffffff2, 0x0 };
+ VECT_VAR_DECL(expected_st3_0,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0x0,
+ 					       0x0, 0x0, 0x0, 0x0 };
++VECT_VAR_DECL(expected_st3_0,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0x0,
++						 0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st3_0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
+ 						 0xc1600000, 0x0 };
+ 
+@@ -88,6 +97,7 @@ VECT_VAR_DECL(expected_st3_1,uint,32,2) [] = { 0xfffffff2, 0x0 };
+ VECT_VAR_DECL(expected_st3_1,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 					      0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st3_1,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
++VECT_VAR_DECL(expected_st3_1,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st3_1,hfloat,32,2) [] = { 0xc1600000, 0x0 };
+ VECT_VAR_DECL(expected_st3_1,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 					      0x0, 0x0, 0x0, 0x0 };
+@@ -97,6 +107,8 @@ VECT_VAR_DECL(expected_st3_1,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ VECT_VAR_DECL(expected_st3_1,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st3_1,poly,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 					       0x0, 0x0, 0x0, 0x0 };
++VECT_VAR_DECL(expected_st3_1,hfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
++						 0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st3_1,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+ 
+ /* Expected results for vst3, chunk 2.  */
+@@ -111,6 +123,7 @@ VECT_VAR_DECL(expected_st3_2,uint,32,2) [] = { 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st3_2,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 					      0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st3_2,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
++VECT_VAR_DECL(expected_st3_2,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st3_2,hfloat,32,2) [] = { 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st3_2,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 					      0x0, 0x0, 0x0, 0x0 };
+@@ -120,6 +133,8 @@ VECT_VAR_DECL(expected_st3_2,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ VECT_VAR_DECL(expected_st3_2,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st3_2,poly,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 					       0x0, 0x0, 0x0, 0x0 };
++VECT_VAR_DECL(expected_st3_2,hfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
++						 0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st3_2,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+ 
+ /* Expected results for vst4, chunk 0.  */
+@@ -134,6 +149,7 @@ VECT_VAR_DECL(expected_st4_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
+ VECT_VAR_DECL(expected_st4_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
+ 					      0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st4_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
++VECT_VAR_DECL(expected_st4_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
+ VECT_VAR_DECL(expected_st4_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
+ VECT_VAR_DECL(expected_st4_0,int,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
+ 					      0x0, 0x0, 0x0, 0x0 };
+@@ -145,6 +161,8 @@ VECT_VAR_DECL(expected_st4_0,uint,32,4) [] = { 0xfffffff0, 0xfffffff1,
+ 					       0xfffffff2, 0xfffffff3 };
+ VECT_VAR_DECL(expected_st4_0,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
+ 					       0x0, 0x0, 0x0, 0x0 };
++VECT_VAR_DECL(expected_st4_0,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80,
++						 0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st4_0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
+ 						 0xc1600000, 0xc1500000 };
+ 
+@@ -160,6 +178,7 @@ VECT_VAR_DECL(expected_st4_1,uint,32,2) [] = { 0xfffffff2, 0xfffffff3 };
+ VECT_VAR_DECL(expected_st4_1,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 					      0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st4_1,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
++VECT_VAR_DECL(expected_st4_1,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st4_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
+ VECT_VAR_DECL(expected_st4_1,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 					      0x0, 0x0, 0x0, 0x0 };
+@@ -169,6 +188,8 @@ VECT_VAR_DECL(expected_st4_1,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ VECT_VAR_DECL(expected_st4_1,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st4_1,poly,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 					       0x0, 0x0, 0x0, 0x0 };
++VECT_VAR_DECL(expected_st4_1,hfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
++						 0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st4_1,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+ 
+ /* Expected results for vst4, chunk 2.  */
+@@ -183,6 +204,7 @@ VECT_VAR_DECL(expected_st4_2,uint,32,2) [] = { 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st4_2,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 					      0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st4_2,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
++VECT_VAR_DECL(expected_st4_2,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st4_2,hfloat,32,2) [] = { 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st4_2,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 					      0x0, 0x0, 0x0, 0x0 };
+@@ -192,6 +214,8 @@ VECT_VAR_DECL(expected_st4_2,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ VECT_VAR_DECL(expected_st4_2,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st4_2,poly,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 					       0x0, 0x0, 0x0, 0x0 };
++VECT_VAR_DECL(expected_st4_2,hfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
++					       0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st4_2,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+ 
+ /* Expected results for vst4, chunk 3.  */
+@@ -206,6 +230,7 @@ VECT_VAR_DECL(expected_st4_3,uint,32,2) [] = { 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st4_3,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 					      0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st4_3,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
++VECT_VAR_DECL(expected_st4_3,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st4_3,hfloat,32,2) [] = { 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st4_3,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 					      0x0, 0x0, 0x0, 0x0 };
+@@ -215,6 +240,8 @@ VECT_VAR_DECL(expected_st4_3,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ VECT_VAR_DECL(expected_st4_3,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st4_3,poly,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 					       0x0, 0x0, 0x0, 0x0 };
++VECT_VAR_DECL(expected_st4_3,hfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
++						 0x0, 0x0, 0x0, 0x0 };
+ VECT_VAR_DECL(expected_st4_3,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+ 
+ /* Declare additional input buffers as needed.  */
+@@ -229,6 +256,7 @@ VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 32, 2);
+ VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 64, 2);
+ VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 8, 2);
+ VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 16, 2);
++VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 16, 2);
+ VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 32, 2);
+ 
+ /* Input buffers for vld3_lane.  */
+@@ -242,6 +270,7 @@ VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 32, 3);
+ VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 64, 3);
+ VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 8, 3);
+ VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 16, 3);
++VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 16, 3);
+ VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 32, 3);
+ 
+ /* Input buffers for vld4_lane.  */
+@@ -255,6 +284,7 @@ VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 32, 4);
+ VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 64, 4);
+ VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 8, 4);
+ VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 16, 4);
++VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 16, 4);
+ VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 32, 4);
+ 
+ void exec_vstX_lane (void)
+@@ -302,7 +332,7 @@ void exec_vstX_lane (void)
+ 
+   /* We need all variants in 64 bits, but there is no 64x2 variant,
+      nor 128 bits vectors of int8/uint8/poly8.  */
+-#define DECL_ALL_VSTX_LANE(X)			\
++#define DECL_ALL_VSTX_LANE_NO_FP16(X)		\
+   DECL_VSTX_LANE(int, 8, 8, X);			\
+   DECL_VSTX_LANE(int, 16, 4, X);		\
+   DECL_VSTX_LANE(int, 32, 2, X);		\
+@@ -319,11 +349,20 @@ void exec_vstX_lane (void)
+   DECL_VSTX_LANE(poly, 16, 8, X);		\
+   DECL_VSTX_LANE(float, 32, 4, X)
+ 
++#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
++#define DECL_ALL_VSTX_LANE(X)		\
++  DECL_ALL_VSTX_LANE_NO_FP16(X);	\
++  DECL_VSTX_LANE(float, 16, 4, X);	\
++  DECL_VSTX_LANE(float, 16, 8, X)
++#else
++#define DECL_ALL_VSTX_LANE(X) DECL_ALL_VSTX_LANE_NO_FP16(X)
++#endif
++
+ #define DUMMY_ARRAY(V, T, W, N, L) VECT_VAR_DECL(V,T,W,N)[N*L]
+ 
+   /* Use the same lanes regardless of the size of the array (X), for
+      simplicity.  */
+-#define TEST_ALL_VSTX_LANE(X)			\
++#define TEST_ALL_VSTX_LANE_NO_FP16(X)		\
+   TEST_VSTX_LANE(, int, s, 8, 8, X, 7);		\
+   TEST_VSTX_LANE(, int, s, 16, 4, X, 2);	\
+   TEST_VSTX_LANE(, int, s, 32, 2, X, 0);	\
+@@ -340,7 +379,16 @@ void exec_vstX_lane (void)
+   TEST_VSTX_LANE(q, poly, p, 16, 8, X, 5);	\
+   TEST_VSTX_LANE(q, float, f, 32, 4, X, 2)
+ 
+-#define TEST_ALL_EXTRA_CHUNKS(X, Y)		\
++#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
++#define TEST_ALL_VSTX_LANE(X)			\
++  TEST_ALL_VSTX_LANE_NO_FP16(X);		\
++  TEST_VSTX_LANE(, float, f, 16, 4, X, 2);	\
++  TEST_VSTX_LANE(q, float, f, 16, 8, X, 6)
++#else
++#define TEST_ALL_VSTX_LANE(X) TEST_ALL_VSTX_LANE_NO_FP16(X)
++#endif
++
++#define TEST_ALL_EXTRA_CHUNKS_NO_FP16(X, Y)	\
+   TEST_EXTRA_CHUNK(int, 8, 8, X, Y);		\
+   TEST_EXTRA_CHUNK(int, 16, 4, X, Y);		\
+   TEST_EXTRA_CHUNK(int, 32, 2, X, Y);		\
+@@ -357,6 +405,15 @@ void exec_vstX_lane (void)
+   TEST_EXTRA_CHUNK(poly, 16, 8, X, Y);		\
+   TEST_EXTRA_CHUNK(float, 32, 4, X, Y)
+ 
++#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
++#define TEST_ALL_EXTRA_CHUNKS(X,Y)		\
++  TEST_ALL_EXTRA_CHUNKS_NO_FP16(X, Y);		\
++  TEST_EXTRA_CHUNK(float, 16, 4, X, Y);		\
++  TEST_EXTRA_CHUNK(float, 16, 8, X, Y)
++#else
++#define TEST_ALL_EXTRA_CHUNKS(X,Y) TEST_ALL_EXTRA_CHUNKS_NO_FP16(X, Y)
++#endif
++
+   /* Declare the temporary buffers / variables.  */
+   DECL_ALL_VSTX_LANE(2);
+   DECL_ALL_VSTX_LANE(3);
+@@ -371,12 +428,18 @@ void exec_vstX_lane (void)
+   DUMMY_ARRAY(buffer_src, uint, 32, 2, 4);
+   DUMMY_ARRAY(buffer_src, poly, 8, 8, 4);
+   DUMMY_ARRAY(buffer_src, poly, 16, 4, 4);
++#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
++  DUMMY_ARRAY(buffer_src, float, 16, 4, 4);
++#endif
+   DUMMY_ARRAY(buffer_src, float, 32, 2, 4);
+   DUMMY_ARRAY(buffer_src, int, 16, 8, 4);
+   DUMMY_ARRAY(buffer_src, int, 32, 4, 4);
+   DUMMY_ARRAY(buffer_src, uint, 16, 8, 4);
+   DUMMY_ARRAY(buffer_src, uint, 32, 4, 4);
+   DUMMY_ARRAY(buffer_src, poly, 16, 8, 4);
++#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
++  DUMMY_ARRAY(buffer_src, float, 16, 8, 4);
++#endif
+   DUMMY_ARRAY(buffer_src, float, 32, 4, 4);
+ 
+   /* Check vst2_lane/vst2q_lane.  */
+@@ -400,6 +463,10 @@ void exec_vstX_lane (void)
+   CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st2_0, CMT);
+   CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st2_0, CMT);
+   CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st2_0, CMT);
++#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
++  CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st2_0, CMT);
++  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st2_0, CMT);
++#endif
+ 
+   TEST_ALL_EXTRA_CHUNKS(2, 1);
+ #undef CMT
+@@ -419,6 +486,10 @@ void exec_vstX_lane (void)
+   CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st2_1, CMT);
+   CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st2_1, CMT);
+   CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st2_1, CMT);
++#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
++  CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st2_1, CMT);
++  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st2_1, CMT);
++#endif
+ 
+ 
+   /* Check vst3_lane/vst3q_lane.  */
+@@ -444,6 +515,10 @@ void exec_vstX_lane (void)
+   CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st3_0, CMT);
+   CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_0, CMT);
+   CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st3_0, CMT);
++#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
++  CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st3_0, CMT);
++  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st3_0, CMT);
++#endif
+ 
+   TEST_ALL_EXTRA_CHUNKS(3, 1);
+ 
+@@ -464,6 +539,10 @@ void exec_vstX_lane (void)
+   CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st3_1, CMT);
+   CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_1, CMT);
+   CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st3_1, CMT);
++#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
++  CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st3_1, CMT);
++  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st3_1, CMT);
++#endif
+ 
+   TEST_ALL_EXTRA_CHUNKS(3, 2);
+ 
+@@ -484,6 +563,10 @@ void exec_vstX_lane (void)
+   CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st3_2, CMT);
+   CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_2, CMT);
+   CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st3_2, CMT);
++#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
++  CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st3_2, CMT);
++  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st3_2, CMT);
++#endif
+ 
+ 
+   /* Check vst4_lane/vst4q_lane.  */
+@@ -509,6 +592,10 @@ void exec_vstX_lane (void)
+   CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st4_0, CMT);
+   CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_0, CMT);
+   CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st4_0, CMT);
++#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
++  CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st4_0, CMT);
++  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st4_0, CMT);
++#endif
+ 
+   TEST_ALL_EXTRA_CHUNKS(4, 1);
+ 
+@@ -529,6 +616,10 @@ void exec_vstX_lane (void)
+   CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st4_1, CMT);
+   CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_1, CMT);
+   CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st4_1, CMT);
++#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
++  CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st4_1, CMT);
++  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st4_1, CMT);
++#endif
+ 
+   TEST_ALL_EXTRA_CHUNKS(4, 2);
+ 
+@@ -549,6 +640,10 @@ void exec_vstX_lane (void)
+   CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st4_2, CMT);
+   CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_2, CMT);
+   CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st4_2, CMT);
++#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
++  CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st4_2, CMT);
++  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st4_2, CMT);
++#endif
+ 
+   TEST_ALL_EXTRA_CHUNKS(4, 3);
+ 
+@@ -569,6 +664,10 @@ void exec_vstX_lane (void)
+   CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st4_3, CMT);
+   CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_3, CMT);
+   CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st4_3, CMT);
++#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
++  CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st4_3, CMT);
++  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st4_3, CMT);
++#endif
+ }
+ 
+ int main (void)
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtst.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtst.c
+@@ -32,10 +32,21 @@ VECT_VAR_DECL(expected_unsigned,uint,16,8) [] = { 0x0, 0xffff,
+ VECT_VAR_DECL(expected_unsigned,uint,32,4) [] = { 0x0, 0xffffffff,
+ 						  0x0, 0xffffffff };
+ 
+-#ifndef INSN_NAME
++/* Expected results with poly input.  */
++VECT_VAR_DECL(expected_poly,uint,8,8) [] = { 0x0, 0xff, 0xff, 0xff,
++					     0xff, 0xff, 0xff, 0xff };
++VECT_VAR_DECL(expected_poly,uint,8,16) [] = { 0x0, 0xff, 0xff, 0xff,
++					      0xff, 0xff, 0xff, 0xff,
++					      0xff, 0xff, 0xff, 0xff,
++					      0xff, 0xff, 0xff, 0xff };
++VECT_VAR_DECL(expected_poly,uint,16,4) [] = { 0x0, 0xffff, 0x0, 0xffff };
++VECT_VAR_DECL(expected_poly,uint,16,8) [] = { 0x0, 0xffff,
++					      0x0, 0xffff,
++					      0xffff, 0xffff,
++					      0xffff, 0xffff };
++
+ #define INSN_NAME vtst
+ #define TEST_MSG "VTST/VTSTQ"
+-#endif
+ 
+ /* We can't use the standard ref_v_binary_op.c template because vtst
+    has no 64 bits variant, and outputs are always of uint type.  */
+@@ -73,12 +84,16 @@ FNNAME (INSN_NAME)
+   VDUP(vector2, , uint, u, 8, 8, 15);
+   VDUP(vector2, , uint, u, 16, 4, 5);
+   VDUP(vector2, , uint, u, 32, 2, 1);
++  VDUP(vector2, , poly, p, 8, 8, 15);
++  VDUP(vector2, , poly, p, 16, 4, 5);
+   VDUP(vector2, q, int, s, 8, 16, 15);
+   VDUP(vector2, q, int, s, 16, 8, 5);
+   VDUP(vector2, q, int, s, 32, 4, 1);
+   VDUP(vector2, q, uint, u, 8, 16, 15);
+   VDUP(vector2, q, uint, u, 16, 8, 5);
+   VDUP(vector2, q, uint, u, 32, 4, 1);
++  VDUP(vector2, q, poly, p, 8, 16, 15);
++  VDUP(vector2, q, poly, p, 16, 8, 5);
+ 
+ #define TEST_MACRO_NO64BIT_VARIANT_1_5(MACRO, VAR, T1, T2)	\
+   MACRO(VAR, , T1, T2, 8, 8);					\
+@@ -111,6 +126,18 @@ FNNAME (INSN_NAME)
+   CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_unsigned, CMT);
+   CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_unsigned, CMT);
+   CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_unsigned, CMT);
++
++  /* Now, test the variants with poly8 and poly16 as input.  */
++#undef CMT
++#define CMT " (poly input)"
++  TEST_BINARY_OP(INSN_NAME, , poly, p, 8, 8);
++  TEST_BINARY_OP(INSN_NAME, , poly, p, 16, 4);
++  TEST_BINARY_OP(INSN_NAME, q, poly, p, 8, 16);
++  TEST_BINARY_OP(INSN_NAME, q, poly, p, 16, 8);
++  CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_poly, CMT);
++  CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_poly, CMT);
++  CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_poly, CMT);
++  CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_poly, CMT);
+ }
+ 
+ int main (void)
+--- a/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-1.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-1.c
+@@ -1,4 +1,5 @@
+ /* { dg-error "unknown" "" {target "aarch64*-*-*" } } */
++/* { dg-skip-if "do not override -mcpu" { *-*-* } { "-mcpu=*" } { "" } } */
+ /* { dg-options "-O2 -mcpu=dummy" } */
+ 
+ void f ()
+--- a/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-2.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-2.c
+@@ -1,4 +1,5 @@
+ /* { dg-error "missing" "" {target "aarch64*-*-*" } } */
++/* { dg-skip-if "do not override -mcpu" { *-*-* } { "-mcpu=*" } { "" } } */
+ /* { dg-options "-O2 -mcpu=cortex-a53+no" } */
+ 
+ void f ()
+--- a/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-3.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-3.c
+@@ -1,4 +1,5 @@
+ /* { dg-error "invalid feature" "" {target "aarch64*-*-*" } } */
++/* { dg-skip-if "do not override -mcpu" { *-*-* } { "-mcpu=*" } { "" } } */
+ /* { dg-options "-O2 -mcpu=cortex-a53+dummy" } */
+ 
+ void f ()
+--- a/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-4.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-4.c
+@@ -1,4 +1,5 @@
+ /* { dg-error "missing" "" {target "aarch64*-*-*" } } */
++/* { dg-skip-if "do not override -mcpu" { *-*-* } { "-mcpu=*" } { "" } } */
+ /* { dg-options "-O2 -mcpu=+dummy" } */
+ 
+ void f ()
+--- a/src/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
+@@ -110,6 +110,6 @@ main (int argc, char **argv)
+ /* vfmaq_lane_f64.
+    vfma_laneq_f64.
+    vfmaq_laneq_f64.  */
+-/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */
++/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */
+ 
+ 
+--- a/src/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
+@@ -111,6 +111,6 @@ main (int argc, char **argv)
+ /* vfmsq_lane_f64.
+    vfms_laneq_f64.
+    vfmsq_laneq_f64.  */
+-/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */
++/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */
+ 
+ 
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c
+@@ -0,0 +1,541 @@
++/* Test the vmul_n_f64 AArch64 SIMD intrinsic.  */
++
++/* { dg-do run } */
++/* { dg-options "-O2 --save-temps" } */
++
++#include "arm_neon.h"
++
++extern void abort (void);
++
++#define A (132.4f)
++#define B (-0.0f)
++#define C (-34.8f)
++#define D (289.34f)
++float32_t expected2_1[2] = {A * A, B * A};
++float32_t expected2_2[2] = {A * B, B * B};
++float32_t expected4_1[4] = {A * A, B * A, C * A, D * A};
++float32_t expected4_2[4] = {A * B, B * B, C * B, D * B};
++float32_t expected4_3[4] = {A * C, B * C, C * C, D * C};
++float32_t expected4_4[4] = {A * D, B * D, C * D, D * D};
++float32_t _elemA = A;
++float32_t _elemB = B;
++float32_t _elemC = C;
++float32_t _elemD = D;
++
++#define AD (1234.5)
++#define BD (-0.0)
++#define CD (71.3)
++#define DD (-1024.4)
++float64_t expectedd2_1[2] = {AD * CD, BD * CD};
++float64_t expectedd2_2[2] = {AD * DD, BD * DD};
++float64_t _elemdC = CD;
++float64_t _elemdD = DD;
++
++
++#define AS (1024)
++#define BS (-31)
++#define CS (0)
++#define DS (655)
++int32_t expecteds2_1[2] = {AS * AS, BS * AS};
++int32_t expecteds2_2[2] = {AS * BS, BS * BS};
++int32_t expecteds4_1[4] = {AS * AS, BS * AS, CS * AS, DS * AS};
++int32_t expecteds4_2[4] = {AS * BS, BS * BS, CS * BS, DS * BS};
++int32_t expecteds4_3[4] = {AS * CS, BS * CS, CS * CS, DS * CS};
++int32_t expecteds4_4[4] = {AS * DS, BS * DS, CS * DS, DS * DS};
++int32_t _elemsA = AS;
++int32_t _elemsB = BS;
++int32_t _elemsC = CS;
++int32_t _elemsD = DS;
++
++#define AH ((int16_t) 0)
++#define BH ((int16_t) -32)
++#define CH ((int16_t) 102)
++#define DH ((int16_t) -51)
++#define EH ((int16_t) 71)
++#define FH ((int16_t) -91)
++#define GH ((int16_t) 48)
++#define HH ((int16_t) 255)
++int16_t expectedh4_1[4] = {AH * AH, BH * AH, CH * AH, DH * AH};
++int16_t expectedh4_2[4] = {AH * BH, BH * BH, CH * BH, DH * BH};
++int16_t expectedh4_3[4] = {AH * CH, BH * CH, CH * CH, DH * CH};
++int16_t expectedh4_4[4] = {AH * DH, BH * DH, CH * DH, DH * DH};
++int16_t expectedh8_1[8] = {AH * AH, BH * AH, CH * AH, DH * AH,
++			   EH * AH, FH * AH, GH * AH, HH * AH};
++int16_t expectedh8_2[8] = {AH * BH, BH * BH, CH * BH, DH * BH,
++			   EH * BH, FH * BH, GH * BH, HH * BH};
++int16_t expectedh8_3[8] = {AH * CH, BH * CH, CH * CH, DH * CH,
++			   EH * CH, FH * CH, GH * CH, HH * CH};
++int16_t expectedh8_4[8] = {AH * DH, BH * DH, CH * DH, DH * DH,
++			   EH * DH, FH * DH, GH * DH, HH * DH};
++int16_t expectedh8_5[8] = {AH * EH, BH * EH, CH * EH, DH * EH,
++			   EH * EH, FH * EH, GH * EH, HH * EH};
++int16_t expectedh8_6[8] = {AH * FH, BH * FH, CH * FH, DH * FH,
++			   EH * FH, FH * FH, GH * FH, HH * FH};
++int16_t expectedh8_7[8] = {AH * GH, BH * GH, CH * GH, DH * GH,
++			   EH * GH, FH * GH, GH * GH, HH * GH};
++int16_t expectedh8_8[8] = {AH * HH, BH * HH, CH * HH, DH * HH,
++			   EH * HH, FH * HH, GH * HH, HH * HH};
++int16_t _elemhA = AH;
++int16_t _elemhB = BH;
++int16_t _elemhC = CH;
++int16_t _elemhD = DH;
++int16_t _elemhE = EH;
++int16_t _elemhF = FH;
++int16_t _elemhG = GH;
++int16_t _elemhH = HH;
++
++#define AUS (1024)
++#define BUS (31)
++#define CUS (0)
++#define DUS (655)
++uint32_t expectedus2_1[2] = {AUS * AUS, BUS * AUS};
++uint32_t expectedus2_2[2] = {AUS * BUS, BUS * BUS};
++uint32_t expectedus4_1[4] = {AUS * AUS, BUS * AUS, CUS * AUS, DUS * AUS};
++uint32_t expectedus4_2[4] = {AUS * BUS, BUS * BUS, CUS * BUS, DUS * BUS};
++uint32_t expectedus4_3[4] = {AUS * CUS, BUS * CUS, CUS * CUS, DUS * CUS};
++uint32_t expectedus4_4[4] = {AUS * DUS, BUS * DUS, CUS * DUS, DUS * DUS};
++uint32_t _elemusA = AUS;
++uint32_t _elemusB = BUS;
++uint32_t _elemusC = CUS;
++uint32_t _elemusD = DUS;
++
++#define AUH ((uint16_t) 0)
++#define BUH ((uint16_t) 32)
++#define CUH ((uint16_t) 102)
++#define DUH ((uint16_t) 51)
++#define EUH ((uint16_t) 71)
++#define FUH ((uint16_t) 91)
++#define GUH ((uint16_t) 48)
++#define HUH ((uint16_t) 255)
++uint16_t expecteduh4_1[4] = {AUH * AUH, BUH * AUH, CUH * AUH, DUH * AUH};
++uint16_t expecteduh4_2[4] = {AUH * BUH, BUH * BUH, CUH * BUH, DUH * BUH};
++uint16_t expecteduh4_3[4] = {AUH * CUH, BUH * CUH, CUH * CUH, DUH * CUH};
++uint16_t expecteduh4_4[4] = {AUH * DUH, BUH * DUH, CUH * DUH, DUH * DUH};
++uint16_t expecteduh8_1[8] = {AUH * AUH, BUH * AUH, CUH * AUH, DUH * AUH,
++			     EUH * AUH, FUH * AUH, GUH * AUH, HUH * AUH};
++uint16_t expecteduh8_2[8] = {AUH * BUH, BUH * BUH, CUH * BUH, DUH * BUH,
++			     EUH * BUH, FUH * BUH, GUH * BUH, HUH * BUH};
++uint16_t expecteduh8_3[8] = {AUH * CUH, BUH * CUH, CUH * CUH, DUH * CUH,
++			     EUH * CUH, FUH * CUH, GUH * CUH, HUH * CUH};
++uint16_t expecteduh8_4[8] = {AUH * DUH, BUH * DUH, CUH * DUH, DUH * DUH,
++			     EUH * DUH, FUH * DUH, GUH * DUH, HUH * DUH};
++uint16_t expecteduh8_5[8] = {AUH * EUH, BUH * EUH, CUH * EUH, DUH * EUH,
++			     EUH * EUH, FUH * EUH, GUH * EUH, HUH * EUH};
++uint16_t expecteduh8_6[8] = {AUH * FUH, BUH * FUH, CUH * FUH, DUH * FUH,
++			     EUH * FUH, FUH * FUH, GUH * FUH, HUH * FUH};
++uint16_t expecteduh8_7[8] = {AUH * GUH, BUH * GUH, CUH * GUH, DUH * GUH,
++			     EUH * GUH, FUH * GUH, GUH * GUH, HUH * GUH};
++uint16_t expecteduh8_8[8] = {AUH * HUH, BUH * HUH, CUH * HUH, DUH * HUH,
++			     EUH * HUH, FUH * HUH, GUH * HUH, HUH * HUH};
++uint16_t _elemuhA = AUH;
++uint16_t _elemuhB = BUH;
++uint16_t _elemuhC = CUH;
++uint16_t _elemuhD = DUH;
++uint16_t _elemuhE = EUH;
++uint16_t _elemuhF = FUH;
++uint16_t _elemuhG = GUH;
++uint16_t _elemuhH = HUH;
++
++void
++check_v2sf (float32_t elemA, float32_t elemB)
++{
++  int32_t indx;
++  const float32_t vec32x2_buf[2] = {A, B};
++  float32x2_t vec32x2_src = vld1_f32 (vec32x2_buf);
++  float32_t vec32x2_res[2];
++
++  vst1_f32 (vec32x2_res, vmul_n_f32 (vec32x2_src, elemA));
++
++  for (indx = 0; indx < 2; indx++)
++    if (* (uint32_t *) &vec32x2_res[indx] != * (uint32_t *) &expected2_1[indx])
++      abort ();
++
++  vst1_f32 (vec32x2_res, vmul_n_f32 (vec32x2_src, elemB));
++
++  for (indx = 0; indx < 2; indx++)
++    if (* (uint32_t *) &vec32x2_res[indx] != * (uint32_t *) &expected2_2[indx])
++      abort ();
++
++/* { dg-final { scan-assembler-times "fmul\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.s\\\[0\\\]" 2 } } */
++}
++
++void
++check_v4sf (float32_t elemA, float32_t elemB, float32_t elemC, float32_t elemD)
++{
++  int32_t indx;
++  const float32_t vec32x4_buf[4] = {A, B, C, D};
++  float32x4_t vec32x4_src = vld1q_f32 (vec32x4_buf);
++  float32_t vec32x4_res[4];
++
++  vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemA));
++
++  for (indx = 0; indx < 4; indx++)
++    if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_1[indx])
++      abort ();
++
++  vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemB));
++
++  for (indx = 0; indx < 4; indx++)
++    if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_2[indx])
++      abort ();
++
++  vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemC));
++
++  for (indx = 0; indx < 4; indx++)
++    if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_3[indx])
++      abort ();
++
++  vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemD));
++
++  for (indx = 0; indx < 4; indx++)
++    if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_4[indx])
++      abort ();
++
++/* { dg-final { scan-assembler-times "fmul\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[0\\\]" 4 } } */
++}
++
++void
++check_v2df (float64_t elemdC, float64_t elemdD)
++{
++  int32_t indx;
++  const float64_t vec64x2_buf[2] = {AD, BD};
++  float64x2_t vec64x2_src = vld1q_f64 (vec64x2_buf);
++  float64_t vec64x2_res[2];
++
++  vst1q_f64 (vec64x2_res, vmulq_n_f64 (vec64x2_src, elemdC));
++
++  for (indx = 0; indx < 2; indx++)
++    if (* (uint64_t *) &vec64x2_res[indx] != * (uint64_t *) &expectedd2_1[indx])
++      abort ();
++
++  vst1q_f64 (vec64x2_res, vmulq_n_f64 (vec64x2_src, elemdD));
++
++  for (indx = 0; indx < 2; indx++)
++    if (* (uint64_t *) &vec64x2_res[indx] != * (uint64_t *) &expectedd2_2[indx])
++      abort ();
++
++/* { dg-final { scan-assembler-times "fmul\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.d\\\[0\\\]" 2 } } */
++}
++
++void
++check_v2si (int32_t elemsA, int32_t elemsB)
++{
++  int32_t indx;
++  const int32_t vecs32x2_buf[2] = {AS, BS};
++  int32x2_t vecs32x2_src = vld1_s32 (vecs32x2_buf);
++  int32_t vecs32x2_res[2];
++
++  vst1_s32 (vecs32x2_res, vmul_n_s32 (vecs32x2_src, elemsA));
++
++  for (indx = 0; indx < 2; indx++)
++    if (vecs32x2_res[indx] != expecteds2_1[indx])
++      abort ();
++
++  vst1_s32 (vecs32x2_res, vmul_n_s32 (vecs32x2_src, elemsB));
++
++  for (indx = 0; indx < 2; indx++)
++    if (vecs32x2_res[indx] != expecteds2_2[indx])
++      abort ();
++}
++
++void
++check_v2si_unsigned (uint32_t elemusA, uint32_t elemusB)
++{
++  int indx;
++  const uint32_t vecus32x2_buf[2] = {AUS, BUS};
++  uint32x2_t vecus32x2_src = vld1_u32 (vecus32x2_buf);
++  uint32_t vecus32x2_res[2];
++
++  vst1_u32 (vecus32x2_res, vmul_n_u32 (vecus32x2_src, elemusA));
++
++  for (indx = 0; indx < 2; indx++)
++    if (vecus32x2_res[indx] != expectedus2_1[indx])
++      abort ();
++
++  vst1_u32 (vecus32x2_res, vmul_n_u32 (vecus32x2_src, elemusB));
++
++  for (indx = 0; indx < 2; indx++)
++    if (vecus32x2_res[indx] != expectedus2_2[indx])
++      abort ();
++
++/* { dg-final { scan-assembler-times "\tmul\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.s\\\[0\\\]" 4 } } */
++}
++
++void
++check_v4si (int32_t elemsA, int32_t elemsB, int32_t elemsC, int32_t elemsD)
++{
++  int32_t indx;
++  const int32_t vecs32x4_buf[4] = {AS, BS, CS, DS};
++  int32x4_t vecs32x4_src = vld1q_s32 (vecs32x4_buf);
++  int32_t vecs32x4_res[4];
++
++  vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsA));
++
++  for (indx = 0; indx < 4; indx++)
++    if (vecs32x4_res[indx] != expecteds4_1[indx])
++      abort ();
++
++  vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsB));
++
++  for (indx = 0; indx < 4; indx++)
++    if (vecs32x4_res[indx] != expecteds4_2[indx])
++      abort ();
++
++  vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsC));
++
++  for (indx = 0; indx < 4; indx++)
++    if (vecs32x4_res[indx] != expecteds4_3[indx])
++      abort ();
++
++  vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsD));
++
++  for (indx = 0; indx < 4; indx++)
++    if (vecs32x4_res[indx] != expecteds4_4[indx])
++      abort ();
++}
++
++void
++check_v4si_unsigned (uint32_t elemusA, uint32_t elemusB, uint32_t elemusC,
++		     uint32_t elemusD)
++{
++  int indx;
++  const uint32_t vecus32x4_buf[4] = {AUS, BUS, CUS, DUS};
++  uint32x4_t vecus32x4_src = vld1q_u32 (vecus32x4_buf);
++  uint32_t vecus32x4_res[4];
++
++  vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusA));
++
++  for (indx = 0; indx < 4; indx++)
++    if (vecus32x4_res[indx] != expectedus4_1[indx])
++      abort ();
++
++  vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusB));
++
++  for (indx = 0; indx < 4; indx++)
++    if (vecus32x4_res[indx] != expectedus4_2[indx])
++      abort ();
++
++  vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusC));
++
++  for (indx = 0; indx < 4; indx++)
++    if (vecus32x4_res[indx] != expectedus4_3[indx])
++      abort ();
++
++  vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusD));
++
++  for (indx = 0; indx < 4; indx++)
++    if (vecus32x4_res[indx] != expectedus4_4[indx])
++      abort ();
++
++/* { dg-final { scan-assembler-times "\tmul\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[0\\\]" 8 } } */
++}
++
++
++void
++check_v4hi (int16_t elemhA, int16_t elemhB, int16_t elemhC, int16_t elemhD)
++{
++  int32_t indx;
++  const int16_t vech16x4_buf[4] = {AH, BH, CH, DH};
++  int16x4_t vech16x4_src = vld1_s16 (vech16x4_buf);
++  int16_t vech16x4_res[4];
++
++  vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhA));
++
++  for (indx = 0; indx < 4; indx++)
++    if (vech16x4_res[indx] != expectedh4_1[indx])
++      abort ();
++
++  vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhB));
++
++  for (indx = 0; indx < 4; indx++)
++    if (vech16x4_res[indx] != expectedh4_2[indx])
++      abort ();
++
++  vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhC));
++
++  for (indx = 0; indx < 4; indx++)
++    if (vech16x4_res[indx] != expectedh4_3[indx])
++      abort ();
++
++  vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhD));
++
++  for (indx = 0; indx < 4; indx++)
++    if (vech16x4_res[indx] != expectedh4_4[indx])
++      abort ();
++}
++
++void
++check_v4hi_unsigned (uint16_t elemuhA, uint16_t elemuhB, uint16_t elemuhC,
++		     uint16_t elemuhD)
++{
++  int indx;
++  const uint16_t vecuh16x4_buf[4] = {AUH, BUH, CUH, DUH};
++  uint16x4_t vecuh16x4_src = vld1_u16 (vecuh16x4_buf);
++  uint16_t vecuh16x4_res[4];
++
++  vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhA));
++
++  for (indx = 0; indx < 4; indx++)
++    if (vecuh16x4_res[indx] != expecteduh4_1[indx])
++      abort ();
++
++  vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhB));
++
++  for (indx = 0; indx < 4; indx++)
++    if (vecuh16x4_res[indx] != expecteduh4_2[indx])
++      abort ();
++
++  vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhC));
++
++  for (indx = 0; indx < 4; indx++)
++    if (vecuh16x4_res[indx] != expecteduh4_3[indx])
++      abort ();
++
++  vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhD));
++
++  for (indx = 0; indx < 4; indx++)
++    if (vecuh16x4_res[indx] != expecteduh4_4[indx])
++      abort ();
++
++/* { dg-final { scan-assembler-times "mul\tv\[0-9\]+\.4h, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[0\\\]" 8 } } */
++}
++
++void
++check_v8hi (int16_t elemhA, int16_t elemhB, int16_t elemhC, int16_t elemhD,
++	    int16_t elemhE, int16_t elemhF, int16_t elemhG, int16_t elemhH)
++{
++  int32_t indx;
++  const int16_t vech16x8_buf[8] = {AH, BH, CH, DH, EH, FH, GH, HH};
++  int16x8_t vech16x8_src = vld1q_s16 (vech16x8_buf);
++  int16_t vech16x8_res[8];
++
++  vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhA));
++
++  for (indx = 0; indx < 8; indx++)
++    if (vech16x8_res[indx] != expectedh8_1[indx])
++      abort ();
++
++  vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhB));
++
++  for (indx = 0; indx < 8; indx++)
++    if (vech16x8_res[indx] != expectedh8_2[indx])
++      abort ();
++
++  vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhC));
++
++  for (indx = 0; indx < 8; indx++)
++    if (vech16x8_res[indx] != expectedh8_3[indx])
++      abort ();
++
++  vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhD));
++
++  for (indx = 0; indx < 8; indx++)
++    if (vech16x8_res[indx] != expectedh8_4[indx])
++      abort ();
++
++  vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhE));
++
++  for (indx = 0; indx < 8; indx++)
++    if (vech16x8_res[indx] != expectedh8_5[indx])
++      abort ();
++
++  vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhF));
++
++  for (indx = 0; indx < 8; indx++)
++    if (vech16x8_res[indx] != expectedh8_6[indx])
++      abort ();
++
++  vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhG));
++
++  for (indx = 0; indx < 8; indx++)
++    if (vech16x8_res[indx] != expectedh8_7[indx])
++      abort ();
++
++  vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhH));
++
++  for (indx = 0; indx < 8; indx++)
++    if (vech16x8_res[indx] != expectedh8_8[indx])
++      abort ();
++}
++
++void
++check_v8hi_unsigned (uint16_t elemuhA, uint16_t elemuhB, uint16_t elemuhC,
++		     uint16_t elemuhD, uint16_t elemuhE, uint16_t elemuhF,
++		     uint16_t elemuhG, uint16_t elemuhH)
++{
++  int indx;
++  const uint16_t vecuh16x8_buf[8] = {AUH, BUH, CUH, DUH, EUH, FUH, GUH, HUH};
++  uint16x8_t vecuh16x8_src = vld1q_u16 (vecuh16x8_buf);
++  uint16_t vecuh16x8_res[8];
++
++  vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhA));
++
++  for (indx = 0; indx < 8; indx++)
++    if (vecuh16x8_res[indx] != expecteduh8_1[indx])
++      abort ();
++
++  vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhB));
++
++  for (indx = 0; indx < 8; indx++)
++    if (vecuh16x8_res[indx] != expecteduh8_2[indx])
++      abort ();
++
++  vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhC));
++
++  for (indx = 0; indx < 8; indx++)
++    if (vecuh16x8_res[indx] != expecteduh8_3[indx])
++      abort ();
++
++  vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhD));
++
++  for (indx = 0; indx < 8; indx++)
++    if (vecuh16x8_res[indx] != expecteduh8_4[indx])
++      abort ();
++
++  vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhE));
++
++  for (indx = 0; indx < 8; indx++)
++    if (vecuh16x8_res[indx] != expecteduh8_5[indx])
++      abort ();
++
++  vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhF));
++
++  for (indx = 0; indx < 8; indx++)
++    if (vecuh16x8_res[indx] != expecteduh8_6[indx])
++      abort ();
++
++  vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhG));
++
++  for (indx = 0; indx < 8; indx++)
++    if (vecuh16x8_res[indx] != expecteduh8_7[indx])
++      abort ();
++
++  vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhH));
++
++  for (indx = 0; indx < 8; indx++)
++    if (vecuh16x8_res[indx] != expecteduh8_8[indx])
++      abort ();
++
++/* { dg-final { scan-assembler-times "mul\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.h\\\[0\\\]" 16 } } */
++}
++
++int
++main (void)
++{
++  check_v2sf (_elemA, _elemB);
++  check_v4sf (_elemA, _elemB, _elemC, _elemD);
++  check_v2df (_elemdC, _elemdD);
++  check_v2si (_elemsA, _elemsB);
++  check_v4si (_elemsA, _elemsB, _elemsC, _elemsD);
++  check_v4hi (_elemhA, _elemhB, _elemhC, _elemhD);
++  check_v8hi (_elemhA, _elemhB, _elemhC, _elemhD,
++	      _elemhE, _elemhF, _elemhG, _elemhH);
++  check_v2si_unsigned (_elemusA, _elemusB);
++  check_v4si_unsigned (_elemusA, _elemusB, _elemusC, _elemusD);
++  check_v4hi_unsigned (_elemuhA, _elemuhB, _elemuhC, _elemuhD);
++  check_v8hi_unsigned (_elemuhA, _elemuhB, _elemuhC, _elemuhD,
++		       _elemuhE, _elemuhF, _elemuhG, _elemuhH);
++
++  return 0;
++}
++
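For reference, the by-scalar multiply pattern the checks above exercise reduces to a minimal sketch like the following (hypothetical snippet, not part of the patch, assuming <arm_neon.h> and -O2 on AArch64); per the scan-assembler patterns, the expectation is the indexed form of mul rather than a plain vector-by-vector multiply:

    #include <arm_neon.h>

    /* Multiply every lane of v by the scalar s.  The intrinsic duplicates s
       into a vector register, and the test above expects the indexed
       encoding "mul vN.8h, vN.8h, vN.h[0]" to be emitted for it.  */
    uint16x8_t
    scale_u16 (uint16x8_t v, uint16_t s)
    {
      return vmulq_n_u16 (v, s);
    }
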
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/aarch64/struct_return.c
+@@ -0,0 +1,31 @@
++/* Test the absence of a spurious move from x8 to x0 for functions
++   returning structures.  */
++/* { dg-do compile } */
++/* { dg-options "-O2" } */
++
++struct s
++{
++  long x;
++  long y;
++  long z;
++};
++
++struct s __attribute__((noinline))
++foo (long a, long d, long c)
++{
++  struct s b;
++  b.x = a;
++  b.y = d;
++  b.z = c;
++  return b;
++}
++
++int
++main (void)
++{
++  struct s x;
++  x = foo ( 10, 20, 30);
++  return x.x + x.y + x.z;
++}
++
++/* { dg-final { scan-assembler-not "mov\tx0, x8" } } */
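For context: struct s is 24 bytes, so under the AAPCS64 it is returned in memory and the caller passes the address of the result slot in x8, the indirect result location register; the scan-assembler-not check guards against foo needlessly copying that pointer into x0. A hypothetical caller-side sketch of the same convention (not part of the patch):

    /* Same layout as struct s in the test above.  */
    struct s { long x; long y; long z; };

    extern struct s foo (long, long, long);

    long
    use_foo (void)
    {
      /* The address of r is what the caller hands to foo in x8.  */
      struct s r = foo (10, 20, 30);
      return r.x + r.y + r.z;
    }
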
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/armv5_thumb_isa.c
+@@ -0,0 +1,8 @@
++/* { dg-require-effective-target arm_arch_v5_ok } */
++/* { dg-add-options arm_arch_v5 } */
++
++#if __ARM_ARCH_ISA_THUMB
++#error "__ARM_ARCH_ISA_THUMB defined for ARMv5"
++#endif
++
++int foo;
+--- a/src/gcc/testsuite/lib/gcc-dg.exp
++++ b/src/gcc/testsuite/lib/gcc-dg.exp
+@@ -403,6 +403,7 @@ if { [info procs ${tool}_load] != [list] \
+ 	    switch [lindex $result 0] {
+ 		"pass" { set status "fail" }
+ 		"fail" { set status "pass" }
++		default { set status [lindex $result 0] }
+ 	    }
+ 	    set result [list $status [lindex $result 1]]
+ 	}
+--- a/src/gcc/tree-scalar-evolution.c
++++ b/src/gcc/tree-scalar-evolution.c
+@@ -1937,6 +1937,36 @@ interpret_rhs_expr (struct loop *loop, gimple *at_stmt,
+       res = chrec_convert (type, chrec1, at_stmt);
+       break;
+ 
++    case BIT_AND_EXPR:
++      /* Given int variable A, handle A&0xffff as (int)(unsigned short)A.
+	 If A is a SCEV and its value is representable in the range of
+	 type unsigned short, the result expression is a (no-overflow)
+	 SCEV.  */
++      res = chrec_dont_know;
++      if (tree_fits_uhwi_p (rhs2))
++	{
++	  int precision;
++	  unsigned HOST_WIDE_INT val = tree_to_uhwi (rhs2);
++
++	  val ++;
++	  /* Skip if value of rhs2 wraps in unsigned HOST_WIDE_INT or
++	     it's not the maximum value of a smaller type than rhs1.  */
++	  if (val != 0
++	      && (precision = exact_log2 (val)) > 0
++	      && (unsigned) precision < TYPE_PRECISION (TREE_TYPE (rhs1)))
++	    {
++	      tree utype = build_nonstandard_integer_type (precision, 1);
++
++	      if (TYPE_PRECISION (utype) < TYPE_PRECISION (TREE_TYPE (rhs1)))
++		{
++		  chrec1 = analyze_scalar_evolution (loop, rhs1);
++		  chrec1 = chrec_convert (utype, chrec1, at_stmt);
++		  res = chrec_convert (TREE_TYPE (rhs1), chrec1, at_stmt);
++		}
++	    }
++	}
++      break;
++
+     default:
+       res = chrec_dont_know;
+       break;
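As a hypothetical illustration of the pattern this new BIT_AND_EXPR case targets: the mask must be one less than a power of two that is narrower than the operand's type, so in a loop such as the sketch below (not part of the patch) i & 0xff can be analyzed as (int)(unsigned char) i, a no-overflow evolution in an 8-bit unsigned type:

    /* 0xff + 1 == 256 is a power of two with exact_log2 == 8 < 32, so the
       masked value is treated as a truncation to an 8-bit unsigned type
       and remains a recognizable scalar evolution for later loop passes.  */
    void
    fill (unsigned char *dst, int n)
    {
      int i;
      for (i = 0; i < n; i++)
        dst[i] = i & 0xff;
    }
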
+--- a/src/libgcc/config/arm/ieee754-df.S
++++ b/src/libgcc/config/arm/ieee754-df.S
+@@ -160,8 +160,8 @@ ARM_FUNC_ALIAS aeabi_dadd adddf3
+ 	teq	r4, r5
+ 	beq	LSYM(Lad_d)
+ 
+-@ CFI note: we're lucky that the branches to Lad_* that appear after this function
+-@ have a CFI state that's exactly the same as the one we're in at this
++@ CFI note: we're lucky that the branches to Lad_* that appear after this
++@ function have a CFI state that's exactly the same as the one we're in at this
+ @ point. Otherwise the CFI would change to a different state after the branch,
+ @ which would be disastrous for backtracing.
+ LSYM(Lad_x):
+@@ -1158,8 +1158,8 @@ ARM_FUNC_ALIAS eqdf2 cmpdf2
+ 1:	str	ip, [sp, #-4]!
+ 	.cfi_adjust_cfa_offset 4        @ CFA is now sp + previousOffset + 4.
+ 	@ We're not adding CFI for ip as it's pushed into the stack
+-	@ only because @ it may be popped off later as a return value
+-	@ (i.e. we're not preserving @ it anyways).
++	@ only because it may be popped off later as a return value
++	@ (i.e. we're not preserving it anyways).
+ 
+ 	@ Trap any INF/NAN first.
+ 	mov	ip, xh, lsl #1
+@@ -1169,14 +1169,14 @@ ARM_FUNC_ALIAS eqdf2 cmpdf2
+ 	COND(mvn,s,ne)	ip, ip, asr #21
+ 	beq	3f
+ 	.cfi_remember_state
+-	@ Save the current CFI state. This is done because the branch
+-	@ is conditional, @ and if we don't take it we'll issue a
+-	@ .cfi_adjust_cfa_offset and return.  @ If we do take it,
+-	@ however, the .cfi_adjust_cfa_offset from the non-branch @ code
+-	@ will affect the branch code as well. To avoid this we'll
+-	@ restore @ the current state before executing the branch code.
+-
+-	@ Test for equality.  @ Note that 0.0 is equal to -0.0.
++	@ Save the current CFI state.  This is done because the branch
++	@ is conditional, and if we don't take it we'll issue a
++	@ .cfi_adjust_cfa_offset and return.  If we do take it,
++	@ however, the .cfi_adjust_cfa_offset from the non-branch code
++	@ will affect the branch code as well.  To avoid this we'll
++	@ restore the current state before executing the branch code.
++
++	@ Test for equality.  Note that 0.0 is equal to -0.0.
+ 2:	add	sp, sp, #4
+ 	.cfi_adjust_cfa_offset -4       @ CFA is now sp + previousOffset.
+ 
diff --git a/debian/rules.defs b/debian/rules.defs
index dd5e368..5853f09 100644
--- a/debian/rules.defs
+++ b/debian/rules.defs
@@ -381,8 +381,6 @@ ifeq ($(distribution),Ubuntu)
   endif
 endif
 
-with_linaro_branch =
-
 # build using fsf or the ibm branch
 ifeq ($(distribution),Ubuntu)
   ifneq (,$(findstring $(DEB_TARGET_ARCH),ppc64el))
diff --git a/debian/rules.patch b/debian/rules.patch
index 605e082..abff61b 100644
--- a/debian/rules.patch
+++ b/debian/rules.patch
@@ -86,12 +86,13 @@ debian_patches += \
 	gcc-SOURCE_DATE_EPOCH \
 	gcc-SOURCE_DATE_EPOCH-2 \
 	cmd-go-combine-gccgo-s-ld-and-ldShared-methods \
-	vulcan-cpu$(if $(with_linaro_branch),-linaro) \
+	vulcan-cpu \
 	vulcan-costs \
 	libjava-mips64el \
 	PR55947-revert \
 	pr68273 \
 
+#	vulcan-cpu$(if $(with_linaro_branch),-linaro) \
 # this is still needed on powerpc, e.g. firefox and insighttoolkit4 will ftbfs.
 ifneq (,$(filter $(DEB_TARGET_ARCH),powerpc))
   debian_patches += pr65913-workaround

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/gcc-6.git


