[gcc-6] 344/401: * Update the Linaro support to the 6.3-2017.01 snapshot.
Ximin Luo
infinity0 at debian.org
Wed Apr 5 15:50:36 UTC 2017
This is an automated email from the git hooks/post-receive script.
infinity0 pushed a commit to branch pu/reproducible_builds
in repository gcc-6.
commit e2d9fb37faffa1593a58abd2dc85b9eb840ae002
Author: doko <doko at 6ca36cf4-e1d1-0310-8c6f-e303bb2178ca>
Date: Thu Jan 12 14:29:49 2017 +0000
* Update the Linaro support to the 6.3-2017.01 snapshot.
git-svn-id: svn://anonscm.debian.org/gcccvs/branches/sid/gcc-6@9224 6ca36cf4-e1d1-0310-8c6f-e303bb2178ca
---
debian/changelog | 1 +
debian/patches/gcc-linaro-doc.diff | 188 +-
debian/patches/gcc-linaro-no-macros.diff | 2 +-
debian/patches/gcc-linaro-r243646-revert.diff | 88 -
debian/patches/gcc-linaro-r244161-revert.diff | 117 -
debian/patches/gcc-linaro-r244242-revert.diff | 184 +
debian/patches/gcc-linaro.diff | 28980 +++++++++++++++++-------
debian/patches/linaro-issue2575.diff | 16 -
debian/rules.patch | 4 +-
9 files changed, 20518 insertions(+), 9062 deletions(-)
diff --git a/debian/changelog b/debian/changelog
index 0527a2b..67d914c 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -15,6 +15,7 @@ gcc-6 (6.3.0-3) UNRELEASED; urgency=medium
* Fix PR target/79044 (PPC), ICE (Bill Schmidt). Closes: #850777.
* Don't add the configured prefix to libcc1's compiler name.
Closes: #851146.
+ * Update the Linaro support to the 6.3-2017.01 snapshot.
-- Matthias Klose <doko at debian.org> Thu, 12 Jan 2017 13:04:12 +0100
diff --git a/debian/patches/gcc-linaro-doc.diff b/debian/patches/gcc-linaro-doc.diff
index f722c32..6b9f8a1 100644
--- a/debian/patches/gcc-linaro-doc.diff
+++ b/debian/patches/gcc-linaro-doc.diff
@@ -1,4 +1,4 @@
-# DP: Changes for the Linaro 6-2016.10 release (documentation).
+# DP: Changes for the Linaro 6-2017.01 release (documentation).
--- a/src/gcc/doc/cpp.texi
+++ b/src/gcc/doc/cpp.texi
@@ -11,6 +11,52 @@
minor version and patch level are reset. If you wish to use the
predefined macros directly in the conditional, you will need to write it
like this:
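For reference, the conditional this passage refers to (not part of the hunk above) is the usual cascading version test from the cpp manual, for example to test for GCC > 3.2.0:

    /* Test for GCC > 3.2.0 */
    #if __GNUC__ > 3 || \
        (__GNUC__ == 3 && (__GNUC_MINOR__ > 2 || \
                           (__GNUC_MINOR__ == 2 && \
                            __GNUC_PATCHLEVEL__ > 0)))
    /* ... */
    #endif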
+--- a/src/gcc/doc/extend.texi
++++ b/src/gcc/doc/extend.texi
+@@ -11416,6 +11416,7 @@ instructions, but allow the compiler to schedule those calls.
+ * ARM iWMMXt Built-in Functions::
+ * ARM C Language Extensions (ACLE)::
+ * ARM Floating Point Status and Control Intrinsics::
++* ARM ARMv8-M Security Extensions::
+ * AVR Built-in Functions::
+ * Blackfin Built-in Functions::
+ * FR-V Built-in Functions::
+@@ -12260,6 +12261,35 @@ unsigned int __builtin_arm_get_fpscr ()
+ void __builtin_arm_set_fpscr (unsigned int)
+ @end smallexample
+
++@node ARM ARMv8-M Security Extensions
++@subsection ARM ARMv8-M Security Extensions
++
++GCC implements the ARMv8-M Security Extensions as described in the ARMv8-M
++Security Extensions: Requirements on Development Tools Engineering
++Specification, which can be found at
++@uref{http://infocenter.arm.com/help/topic/com.arm.doc.ecm0359818/ECM0359818_armv8m_security_extensions_reqs_on_dev_tools_1_0.pdf}.
++
++As part of the Security Extensions GCC implements two new function attributes:
++@code{cmse_nonsecure_entry} and @code{cmse_nonsecure_call}.
++
++As part of the Security Extensions GCC implements the intrinsics below. FPTR
++is used here to mean any function pointer type.
++
++@smallexample
++cmse_address_info_t cmse_TT (void *)
++cmse_address_info_t cmse_TT_fptr (FPTR)
++cmse_address_info_t cmse_TTT (void *)
++cmse_address_info_t cmse_TTT_fptr (FPTR)
++cmse_address_info_t cmse_TTA (void *)
++cmse_address_info_t cmse_TTA_fptr (FPTR)
++cmse_address_info_t cmse_TTAT (void *)
++cmse_address_info_t cmse_TTAT_fptr (FPTR)
++void * cmse_check_address_range (void *, size_t, int)
++typeof(p) cmse_nsfptr_create (FPTR p)
++intptr_t cmse_is_nsfptr (FPTR)
++int cmse_nonsecure_caller (void)
++@end smallexample
++
+ @node AVR Built-in Functions
+ @subsection AVR Built-in Functions
+
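As a rough sketch of how the attribute and intrinsics documented above combine on the secure side (assuming an ARMv8-M toolchain built with -mcmse and the arm_cmse.h header; the entry-point name and buffer handling are invented for illustration):

    #include <arm_cmse.h>
    #include <stddef.h>
    #include <string.h>

    /* Entry point callable from the non-secure state; with -mcmse GCC
       emits the SG-based veneer and scrubs registers on return.  */
    int __attribute__ ((cmse_nonsecure_entry))
    secure_copy_in (void *dst, const void *src, size_t n)
    {
      /* Refuse the call unless it really came from non-secure code.  */
      if (!cmse_nonsecure_caller ())
        return -1;
      /* Only touch the source range if it is entirely non-secure and
         readable; cmse_check_address_range returns NULL otherwise.  */
      if (cmse_check_address_range ((void *) src, n,
                                    CMSE_MPU_READ | CMSE_NONSECURE) == NULL)
        return -1;
      memcpy (dst, src, n);
      return 0;
    }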
--- a/src/gcc/doc/fragments.texi
+++ b/src/gcc/doc/fragments.texi
@@ -156,15 +156,16 @@ variants. And for some targets it is better to reuse an existing multilib
@@ -39,6 +85,78 @@
sets up relations between two option sets rather than two options. Here is an
example to demo how we reuse libraries built in Thumb mode for applications built
in ARM mode:
+--- a/src/gcc/doc/install.texi
++++ b/src/gcc/doc/install.texi
+@@ -1101,19 +1101,59 @@ sysv, aix.
+
+ @item --with-multilib-list=@var{list}
+ @itemx --without-multilib-list
+-Specify what multilibs to build.
+-Currently only implemented for arm*-*-*, sh*-*-* and x86-64-*-linux*.
++Specify what multilibs to build. @var{list} is a comma separated list of
++values, possibly consisting of a single value. Currently only implemented
++for arm*-*-*, sh*-*-* and x86-64-*-linux*. The accepted values and meaning
++for each target are given below.
+
+ @table @code
+ @item arm*-*-*
+-@var{list} is either @code{default} or @code{aprofile}. Specifying
+-@code{default} is equivalent to omitting this option while specifying
+-@code{aprofile} builds multilibs for each combination of ISA (@code{-marm} or
+-@code{-mthumb}), architecture (@code{-march=armv7-a}, @code{-march=armv7ve},
+-or @code{-march=armv8-a}), FPU available (none, @code{-mfpu=vfpv3-d16},
+-@code{-mfpu=neon}, @code{-mfpu=vfpv4-d16}, @code{-mfpu=neon-vfpv4} or
+-@code{-mfpu=neon-fp-armv8} depending on architecture) and floating-point ABI
+-(@code{-mfloat-abi=softfp} or @code{-mfloat-abi=hard}).
++@var{list} is one of @code{default}, @code{aprofile} or @code{rmprofile}.
++Specifying @code{default} is equivalent to omitting this option, i.e. only the
++default runtime library will be enabled. Specifying @code{aprofile} or
++@code{rmprofile} builds multilibs for a combination of ISA, architecture,
++FPU available and floating-point ABI.
++
++The table below gives the combination of ISAs, architectures, FPUs and
++floating-point ABIs for which multilibs are built for each accepted value.
++
++@multitable @columnfractions .15 .28 .30
++@item Option @tab aprofile @tab rmprofile
++@item ISAs
++@tab @code{-marm} and @code{-mthumb}
++@tab @code{-mthumb}
++@item Architectures@*@*@*@*@*@*
++@tab default architecture@*
++@code{-march=armv7-a}@*
++@code{-march=armv7ve}@*
++@code{-march=armv8-a}@*@*@*
++@tab default architecture@*
++@code{-march=armv6s-m}@*
++@code{-march=armv7-m}@*
++@code{-march=armv7e-m}@*
++@code{-march=armv8-m.base}@*
++@code{-march=armv8-m.main}@*
++@code{-march=armv7}
++@item FPUs@*@*@*@*@*
++@tab none@*
++@code{-mfpu=vfpv3-d16}@*
++@code{-mfpu=neon}@*
++@code{-mfpu=vfpv4-d16}@*
++@code{-mfpu=neon-vfpv4}@*
++@code{-mfpu=neon-fp-armv8}
++@tab none@*
++@code{-mfpu=vfpv3-d16}@*
++@code{-mfpu=fpv4-sp-d16}@*
++@code{-mfpu=fpv5-sp-d16}@*
++@code{-mfpu=fpv5-d16}@*
++@item floating-point@/ ABIs@*@*
++@tab @code{-mfloat-abi=soft}@*
++@code{-mfloat-abi=softfp}@*
++@code{-mfloat-abi=hard}
++@tab @code{-mfloat-abi=soft}@*
++@code{-mfloat-abi=softfp}@*
++@code{-mfloat-abi=hard}
++@end multitable
+
+ @item sh*-*-*
+ @var{list} is a comma separated list of CPU names. These must be of the
--- a/src/gcc/doc/invoke.texi
+++ b/src/gcc/doc/invoke.texi
@@ -573,6 +573,8 @@ Objective-C and Objective-C++ Dialects}.
@@ -50,7 +168,17 @@
-march=@var{name} -mcpu=@var{name} -mtune=@var{name}}
@emph{Adapteva Epiphany Options}
-@@ -9478,6 +9480,11 @@ Size of minimal partition for WHOPR (in estimated instructions).
+@@ -632,7 +634,8 @@ Objective-C and Objective-C++ Dialects}.
+ -mneon-for-64bits @gol
+ -mslow-flash-data @gol
+ -masm-syntax-unified @gol
+--mrestrict-it}
++-mrestrict-it @gol
++-mcmse}
+
+ @emph{AVR Options}
+ @gccoptlist{-mmcu=@var{mcu} -maccumulate-args -mbranch-cost=@var{cost} @gol
+@@ -9477,6 +9480,11 @@ Size of minimal partition for WHOPR (in estimated instructions).
This prevents expenses of splitting very small programs into too many
partitions.
@@ -62,7 +190,7 @@
@item cxx-max-namespaces-for-diagnostic-help
The maximum number of namespaces to consult for suggestions when C++
name lookup fails for an identifier. The default is 1000.
-@@ -12828,9 +12835,9 @@ These options are defined for AArch64 implementations:
+@@ -12827,9 +12835,9 @@ These options are defined for AArch64 implementations:
@item -mabi=@var{name}
@opindex mabi
Generate code for the specified data model. Permissible values
@@ -75,7 +203,7 @@
The default depends on the specific target configuration. Note that
the LP64 and ILP32 ABIs are not link-compatible; you must compile your
-@@ -12855,25 +12862,24 @@ Generate little-endian code. This is the default when GCC is configured for an
+@@ -12854,25 +12862,24 @@ Generate little-endian code. This is the default when GCC is configured for an
@item -mcmodel=tiny
@opindex mcmodel=tiny
Generate code for the tiny code model. The program and its statically defined
@@ -108,7 +236,7 @@
@item -momit-leaf-frame-pointer
@itemx -mno-omit-leaf-frame-pointer
-@@ -12895,7 +12901,7 @@ of TLS variables.
+@@ -12894,7 +12901,7 @@ of TLS variables.
@item -mtls-size=@var{size}
@opindex mtls-size
Specify bit size of immediate TLS offsets. Valid values are 12, 24, 32, 48.
@@ -117,7 +245,7 @@
@item -mfix-cortex-a53-835769
@itemx -mno-fix-cortex-a53-835769
-@@ -12915,12 +12921,34 @@ corresponding flag to the linker.
+@@ -12914,12 +12921,34 @@ corresponding flag to the linker.
@item -mlow-precision-recip-sqrt
@item -mno-low-precision-recip-sqrt
@@ -158,7 +286,7 @@
@item -march=@var{name}
@opindex march
-@@ -12929,10 +12957,13 @@ more feature modifiers. This option has the form
+@@ -12928,10 +12957,13 @@ more feature modifiers. This option has the form
@option{-march=@var{arch}@r{@{}+@r{[}no@r{]}@var{feature}@r{@}*}}.
The permissible values for @var{arch} are @samp{armv8-a},
@@ -174,7 +302,7 @@
enables the @samp{+crc} and @samp{+lse} features.
The value @samp{native} is available on native AArch64 GNU/Linux and
-@@ -12956,18 +12987,18 @@ processors implementing the target architecture.
+@@ -12955,18 +12987,18 @@ processors implementing the target architecture.
Specify the name of the target processor for which GCC should tune the
performance of the code. Permissible values for this option are:
@samp{generic}, @samp{cortex-a35}, @samp{cortex-a53}, @samp{cortex-a57},
@@ -202,7 +330,7 @@
Where none of @option{-mtune=}, @option{-mcpu=} or @option{-march=}
are specified, the code is tuned to perform well across a range
-@@ -12987,12 +13018,6 @@ documented in the sub-section on
+@@ -12986,12 +13018,6 @@ documented in the sub-section on
Feature Modifiers}. Where conflicting feature modifiers are
specified, the right-most feature is used.
@@ -215,7 +343,7 @@
GCC uses @var{name} to determine what kind of instructions it can emit when
generating assembly code (as if by @option{-march}) and to determine
the target processor for which to tune for performance (as if
-@@ -13010,11 +13035,11 @@ across releases.
+@@ -13009,11 +13035,11 @@ across releases.
This option is only intended to be useful when developing GCC.
@item -mpc-relative-literal-loads
@@ -232,7 +360,7 @@
@end table
-@@ -13042,12 +13067,14 @@ instructions. This is on by default for all possible values for options
+@@ -13041,12 +13067,14 @@ instructions. This is on by default for all possible values for options
@item lse
Enable Large System Extension instructions. This is on by default for
@option{-march=armv8.1-a}.
@@ -250,7 +378,7 @@
@node Adapteva Epiphany Options
@subsection Adapteva Epiphany Options
-@@ -13967,21 +13994,42 @@ name to determine what kind of instructions it can emit when generating
+@@ -13966,21 +13994,42 @@ name to determine what kind of instructions it can emit when generating
assembly code. This option can be used in conjunction with or instead
of the @option{-mcpu=} option. Permissible names are: @samp{armv2},
@samp{armv2a}, @samp{armv3}, @samp{armv3m}, @samp{armv4}, @samp{armv4t},
@@ -300,16 +428,19 @@
@option{-march=native} causes the compiler to auto-detect the architecture
of the build computer. At present, this feature is only supported on
GNU/Linux, and not all architectures are recognized. If the auto-detect
-@@ -14013,7 +14061,7 @@ Permissible names are: @samp{arm2}, @samp{arm250},
+@@ -14012,8 +14061,10 @@ Permissible names are: @samp{arm2}, @samp{arm250},
@samp{generic-armv7-a}, @samp{cortex-a5}, @samp{cortex-a7}, @samp{cortex-a8},
@samp{cortex-a9}, @samp{cortex-a12}, @samp{cortex-a15}, @samp{cortex-a17},
@samp{cortex-a32}, @samp{cortex-a35}, @samp{cortex-a53}, @samp{cortex-a57},
-@samp{cortex-a72}, @samp{cortex-r4},
+@samp{cortex-a72}, @samp{cortex-a73}, @samp{cortex-r4},
@samp{cortex-r4f}, @samp{cortex-r5}, @samp{cortex-r7}, @samp{cortex-r8},
++@samp{cortex-m33},
++@samp{cortex-m23},
@samp{cortex-m7},
@samp{cortex-m4},
-@@ -14035,7 +14083,8 @@ Permissible names are: @samp{arm2}, @samp{arm250},
+ @samp{cortex-m3},
+@@ -14034,7 +14085,8 @@ Permissible names are: @samp{arm2}, @samp{arm250},
Additionally, this option can specify that GCC should tune the performance
of the code for a big.LITTLE system. Permissible names are:
@samp{cortex-a15.cortex-a7}, @samp{cortex-a17.cortex-a7},
@@ -319,7 +450,7 @@
@option{-mtune=generic-@var{arch}} specifies that GCC should tune the
performance for a blend of processors within architecture @var{arch}.
-@@ -14165,9 +14214,12 @@ otherwise the default is @samp{R10}.
+@@ -14164,9 +14216,12 @@ otherwise the default is @samp{R10}.
@item -mpic-data-is-text-relative
@opindex mpic-data-is-text-relative
@@ -335,7 +466,7 @@
@item -mpoke-function-name
@opindex mpoke-function-name
-@@ -14277,10 +14329,10 @@ generating these instructions. This option is enabled by default when
+@@ -14276,10 +14331,10 @@ generating these instructions. This option is enabled by default when
@opindex mno-unaligned-access
Enables (or disables) reading and writing of 16- and 32- bit values
from addresses that are not 16- or 32- bit aligned. By default
@@ -350,7 +481,20 @@
The ARM attribute @code{Tag_CPU_unaligned_access} is set in the
generated object file to either true or false, depending upon the
-@@ -18082,7 +18134,7 @@ IEEE 754 floating-point data.
+@@ -14319,6 +14374,12 @@ Print CPU tuning information as comment in assembler file. This is
+ an option used only for regression testing of the compiler and not
+ intended for ordinary use in compiling code. This option is disabled
+ by default.
++
++@item -mcmse
++@opindex mcmse
++Generate secure code as per the "ARMv8-M Security Extensions: Requirements on
++Development Tools Engineering Specification", which can be found on
++@url{http://infocenter.arm.com/help/topic/com.arm.doc.ecm0359818/ECM0359818_armv8m_security_extensions_reqs_on_dev_tools_1_0.pdf}.
+ @end table
+
+ @node AVR Options
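A hedged sketch of the call-side counterpart of -mcmse (the callback-registration scheme here is invented for illustration): secure code that calls back into the non-secure state clears the pointer's LSB with cmse_nsfptr_create, so that a call through a cmse_nonsecure_call-typed pointer performs the BLXNS transition, and can test the pointer first with cmse_is_nsfptr:

    #include <arm_cmse.h>

    /* Calls through this type use the non-secure calling convention
       (argument/scratch registers cleared, BLXNS transition).  */
    typedef void __attribute__ ((cmse_nonsecure_call)) ns_callback (int);

    static ns_callback *stored_cb;

    void
    register_callback (void *fn)  /* address handed over by non-secure code */
    {
      /* Clear the LSB to mark this as a non-secure function pointer.  */
      stored_cb = (ns_callback *) cmse_nsfptr_create (fn);
    }

    void
    fire_callback (int arg)
    {
      /* cmse_is_nsfptr is non-zero only for pointers with the LSB clear.  */
      if (stored_cb && cmse_is_nsfptr (stored_cb))
        stored_cb (arg);
    }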
+@@ -18081,7 +18142,7 @@ IEEE 754 floating-point data.
The @option{-mnan=legacy} option selects the legacy encoding. In this
case quiet NaNs (qNaNs) are denoted by the first bit of their trailing
@@ -411,7 +555,7 @@
ARM target supports options to generate ARMv8.1 Adv.SIMD instructions.
Some multilibs may be incompatible with these options.
-@@ -1597,10 +1615,43 @@ ARM target supports executing ARMv8.1 Adv.SIMD instructions. Some
+@@ -1597,10 +1615,47 @@ ARM target supports executing ARMv8.1 Adv.SIMD instructions. Some
multilibs may be incompatible with the options needed. Implies
arm_v8_1a_neon_ok.
@@ -452,10 +596,14 @@
+ARM target generates Thumb-1 code for @code{-mthumb} with
+ at code{CBZ} and @code{CBNZ} instructions available.
+
++@item arm_cmse_ok
++ARM target supports ARMv8-M Security Extensions, enabled by the @code{-mcmse}
++option.
++
@end table
@subsubsection AArch64-specific attributes
-@@ -2066,6 +2117,28 @@ NEON support. Only ARM targets support this feature, and only then
+@@ -2066,6 +2121,28 @@ NEON support. Only ARM targets support this feature, and only then
in certain modes; see the @ref{arm_neon_ok,,arm_neon_ok effective target
keyword}.
@@ -484,7 +632,7 @@
@item arm_neon_fp16
NEON and half-precision floating point support. Only ARM targets
support this feature, and only then in certain modes; see
-@@ -2075,6 +2148,23 @@ the @ref{arm_neon_fp16_ok,,arm_neon_fp16_ok effective target keyword}.
+@@ -2075,6 +2152,23 @@ the @ref{arm_neon_fp16_ok,,arm_neon_fp16_ok effective target keyword}.
arm vfp3 floating point support; see
the @ref{arm_vfp3_ok,,arm_vfp3_ok effective target keyword}.
diff --git a/debian/patches/gcc-linaro-no-macros.diff b/debian/patches/gcc-linaro-no-macros.diff
index 6d5a29e..8755be9 100644
--- a/debian/patches/gcc-linaro-no-macros.diff
+++ b/debian/patches/gcc-linaro-no-macros.diff
@@ -89,7 +89,7 @@ Index: b/src/gcc/LINARO-VERSION
--- a/src/gcc/LINARO-VERSION
+++ /dev/null
@@ -1,1 +0,0 @@
--Snapshot 6.2-2016.10
+-Snapshot 6.3-2017.01
Index: b/src/gcc/configure.ac
===================================================================
--- a/src/gcc/configure.ac
diff --git a/debian/patches/gcc-linaro-r243646-revert.diff b/debian/patches/gcc-linaro-r243646-revert.diff
deleted file mode 100644
index a3638b1..0000000
--- a/debian/patches/gcc-linaro-r243646-revert.diff
+++ /dev/null
@@ -1,88 +0,0 @@
-# Revert r243646 for Linaro builds, already backported to the Linaro branch.
-
-2016-12-14 Wilco Dijkstra <wdijkstr at arm.com>
- Jakub Jelinek <jakub at redhat.com>
-
- PR target/78796
- * config/aarch64/aarch64.c (aarch64_classify_symbol): Merge large
- model checks into switch.
-
-2016-12-14 Jakub Jelinek <jakub at redhat.com>
-
- PR target/78796
- * gcc.dg/tls/pr78796.c: New test.
-
-
---- a/src/gcc/testsuite/gcc.dg/tls/pr78796.c
-+++ b/src/gcc/testsuite/gcc.dg/tls/pr78796.c
-@@ -1,32 +0,0 @@
--/* PR target/78796 */
--/* { dg-do run } */
--/* { dg-options "-O2" } */
--/* { dg-additional-options "-mcmodel=large" { target aarch64-*-* } } */
--/* { dg-require-effective-target tls } */
--
--struct S { int a, b, c, d, e; };
--struct S t;
--__thread struct S s;
--
--__attribute__((used, noinline, noclone)) void
--foo (int *x, int *y)
--{
-- asm volatile ("" : : "g" (x), "g" (y) : "memory");
-- if (*x != 1 || *y != 2)
-- __builtin_abort ();
--}
--
--__attribute__((used, noinline, noclone)) void
--bar (void)
--{
-- foo (&t.c, &s.c);
--}
--
--int
--main ()
--{
-- t.c = 1;
-- s.c = 2;
-- bar ();
-- return 0;
--}
---- a/src/gcc/config/aarch64/aarch64.c
-+++ b/src/gcc/config/aarch64/aarch64.c
-@@ -9280,6 +9280,18 @@
-
- if (GET_CODE (x) == SYMBOL_REF)
- {
-+ if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
-+ {
-+ /* This is alright even in PIC code as the constant
-+ pool reference is always PC relative and within
-+ the same translation unit. */
-+ if (nopcrelative_literal_loads
-+ && CONSTANT_POOL_ADDRESS_P (x))
-+ return SYMBOL_SMALL_ABSOLUTE;
-+ else
-+ return SYMBOL_FORCE_TO_MEM;
-+ }
-+
- if (aarch64_tls_symbol_p (x))
- return aarch64_classify_tls_symbol (x);
-
-@@ -9320,16 +9332,6 @@
- ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
- return SYMBOL_SMALL_ABSOLUTE;
-
-- case AARCH64_CMODEL_LARGE:
-- /* This is alright even in PIC code as the constant
-- pool reference is always PC relative and within
-- the same translation unit. */
-- if (nopcrelative_literal_loads
-- && CONSTANT_POOL_ADDRESS_P (x))
-- return SYMBOL_SMALL_ABSOLUTE;
-- else
-- return SYMBOL_FORCE_TO_MEM;
--
- default:
- gcc_unreachable ();
- }
diff --git a/debian/patches/gcc-linaro-r244161-revert.diff b/debian/patches/gcc-linaro-r244161-revert.diff
deleted file mode 100644
index 3a73a5d..0000000
--- a/debian/patches/gcc-linaro-r244161-revert.diff
+++ /dev/null
@@ -1,117 +0,0 @@
-# DP: Revert r244161, already backported to Linaro
-
---- a/src/gcc/testsuite/gcc.target/arm/pr78041.c
-+++ a/src/gcc/testsuite/gcc.target/arm/pr78041.c
-@@ -1,20 +0,0 @@
--/* { dg-require-effective-target arm_thumb2_ok } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-fno-inline -mthumb -O1 -mfpu=neon -w" } */
--
--extern void abort (void);
--
--register long long x asm ("r1");
--
--long long f (void)
--{
-- return x << 5;
--}
--
--int main ()
--{
-- x = 0x0100000001;
-- if (f () != 0x2000000020)
-- abort ();
-- return 0;
--}
---- a/src/gcc/config/arm/neon.md
-+++ a/src/gcc/config/arm/neon.md
-@@ -1045,12 +1045,12 @@
- )
-
- (define_insn_and_split "ashldi3_neon"
-- [(set (match_operand:DI 0 "s_register_operand" "= w, w,?&r,?r,?&r, ?w,w")
-- (ashift:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 0r, 0, r, 0w,w")
-- (match_operand:SI 2 "general_operand" "rUm, i, r, i, i,rUm,i")))
-- (clobber (match_scratch:SI 3 "= X, X,?&r, X, X, X,X"))
-- (clobber (match_scratch:SI 4 "= X, X,?&r, X, X, X,X"))
-- (clobber (match_scratch:DI 5 "=&w, X, X, X, X, &w,X"))
-+ [(set (match_operand:DI 0 "s_register_operand" "= w, w,?&r,?r, ?w,w")
-+ (ashift:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 0r, r, 0w,w")
-+ (match_operand:SI 2 "general_operand" "rUm, i, r, i,rUm,i")))
-+ (clobber (match_scratch:SI 3 "= X, X,?&r, X, X,X"))
-+ (clobber (match_scratch:SI 4 "= X, X,?&r, X, X,X"))
-+ (clobber (match_scratch:DI 5 "=&w, X, X, X, &w,X"))
- (clobber (reg:CC_C CC_REGNUM))]
- "TARGET_NEON"
- "#"
-@@ -1082,11 +1082,9 @@
- }
- else
- {
-- /* The shift expanders support either full overlap or no overlap. */
-- gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])
-- || REGNO (operands[0]) == REGNO (operands[1]));
--
-- if (operands[2] == CONST1_RTX (SImode))
-+ if (CONST_INT_P (operands[2]) && INTVAL (operands[2]) == 1
-+ && (!reg_overlap_mentioned_p (operands[0], operands[1])
-+ || REGNO (operands[0]) == REGNO (operands[1])))
- /* This clobbers CC. */
- emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1]));
- else
-@@ -1095,8 +1093,8 @@
- }
- DONE;
- }"
-- [(set_attr "arch" "neon_for_64bits,neon_for_64bits,*,*,*,avoid_neon_for_64bits,avoid_neon_for_64bits")
-- (set_attr "opt" "*,*,speed,speed,speed,*,*")
-+ [(set_attr "arch" "neon_for_64bits,neon_for_64bits,*,*,avoid_neon_for_64bits,avoid_neon_for_64bits")
-+ (set_attr "opt" "*,*,speed,speed,*,*")
- (set_attr "type" "multiple")]
- )
-
-@@ -1145,12 +1143,12 @@
- ;; ashrdi3_neon
- ;; lshrdi3_neon
- (define_insn_and_split "<shift>di3_neon"
-- [(set (match_operand:DI 0 "s_register_operand" "= w, w,?&r,?r,?&r,?w,?w")
-- (RSHIFTS:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 0r, 0, r,0w, w")
-- (match_operand:SI 2 "reg_or_int_operand" " r, i, r, i, i, r, i")))
-- (clobber (match_scratch:SI 3 "=2r, X, &r, X, X,2r, X"))
-- (clobber (match_scratch:SI 4 "= X, X, &r, X, X, X, X"))
-- (clobber (match_scratch:DI 5 "=&w, X, X, X, X,&w, X"))
-+ [(set (match_operand:DI 0 "s_register_operand" "= w, w,?&r,?r,?w,?w")
-+ (RSHIFTS:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 0r, r,0w, w")
-+ (match_operand:SI 2 "reg_or_int_operand" " r, i, r, i, r, i")))
-+ (clobber (match_scratch:SI 3 "=2r, X, &r, X,2r, X"))
-+ (clobber (match_scratch:SI 4 "= X, X, &r, X, X, X"))
-+ (clobber (match_scratch:DI 5 "=&w, X, X, X,&w, X"))
- (clobber (reg:CC CC_REGNUM))]
- "TARGET_NEON"
- "#"
-@@ -1186,11 +1184,9 @@
- }
- else
- {
-- /* The shift expanders support either full overlap or no overlap. */
-- gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])
-- || REGNO (operands[0]) == REGNO (operands[1]));
--
-- if (operands[2] == CONST1_RTX (SImode))
-+ if (CONST_INT_P (operands[2]) && INTVAL (operands[2]) == 1
-+ && (!reg_overlap_mentioned_p (operands[0], operands[1])
-+ || REGNO (operands[0]) == REGNO (operands[1])))
- /* This clobbers CC. */
- emit_insn (gen_arm_<shift>di3_1bit (operands[0], operands[1]));
- else
-@@ -1201,8 +1197,8 @@
-
- DONE;
- }"
-- [(set_attr "arch" "neon_for_64bits,neon_for_64bits,*,*,*,avoid_neon_for_64bits,avoid_neon_for_64bits")
-- (set_attr "opt" "*,*,speed,speed,speed,*,*")
-+ [(set_attr "arch" "neon_for_64bits,neon_for_64bits,*,*,avoid_neon_for_64bits,avoid_neon_for_64bits")
-+ (set_attr "opt" "*,*,speed,speed,*,*")
- (set_attr "type" "multiple")]
- )
-
diff --git a/debian/patches/gcc-linaro-r244242-revert.diff b/debian/patches/gcc-linaro-r244242-revert.diff
new file mode 100644
index 0000000..4f2a0e0
--- /dev/null
+++ b/debian/patches/gcc-linaro-r244242-revert.diff
@@ -0,0 +1,184 @@
+Index: configure.ac
+===================================================================
+--- a/src/configure.ac (revision 244242)
++++ a/src/configure.ac (revision 244241)
+@@ -819,9 +819,6 @@
+ *-*-vxworks*)
+ noconfigdirs="$noconfigdirs ${libgcj}"
+ ;;
+- aarch64*-*-freebsd*)
+- noconfigdirs="$noconfigdirs ${libgcj}"
+- ;;
+ alpha*-*-*vms*)
+ noconfigdirs="$noconfigdirs ${libgcj}"
+ ;;
+Index: libgcc/config.host
+===================================================================
+--- a/src/libgcc/config.host (revision 244242)
++++ a/src/libgcc/config.host (revision 244241)
+@@ -333,11 +333,6 @@
+ tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
+ tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
+ ;;
+-aarch64*-*-freebsd*)
+- extra_parts="$extra_parts crtfastmath.o"
+- tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
+- tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
+- ;;
+ aarch64*-*-linux*)
+ extra_parts="$extra_parts crtfastmath.o"
+ md_unwind_header=aarch64/linux-unwind.h
+Index: gcc/config.gcc
+===================================================================
+--- a/src/gcc/config.gcc (revision 244242)
++++ a/src/gcc/config.gcc (revision 244241)
+@@ -946,11 +946,6 @@
+ done
+ TM_MULTILIB_CONFIG=`echo $TM_MULTILIB_CONFIG | sed 's/^,//'`
+ ;;
+-aarch64*-*-freebsd*)
+- tm_file="${tm_file} dbxelf.h elfos.h ${fbsd_tm_file}"
+- tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-freebsd.h"
+- tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-freebsd"
+- ;;
+ aarch64*-*-linux*)
+ tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h glibc-stdint.h"
+ tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-linux.h"
+Index: gcc/config.host
+===================================================================
+--- a/src/gcc/config.host (revision 244242)
++++ a/src/gcc/config.host (revision 244241)
+@@ -99,7 +99,7 @@
+ esac
+
+ case ${host} in
+- aarch64*-*-freebsd* | aarch64*-*-linux*)
++ aarch64*-*-linux*)
+ case ${target} in
+ aarch64*-*-*)
+ host_extra_gcc_objs="driver-aarch64.o"
+Index: gcc/config/aarch64/t-aarch64-freebsd
+===================================================================
+--- a/src/gcc/config/aarch64/t-aarch64-freebsd (revision 244242)
++++ a/src/gcc/config/aarch64/t-aarch64-freebsd (nonexistent)
+@@ -1,21 +0,0 @@
+-# Machine description for AArch64 architecture.
+-# Copyright (C) 2016-2017 Free Software Foundation, Inc.
+-#
+-# This file is part of GCC.
+-#
+-# GCC is free software; you can redistribute it and/or modify it
+-# under the terms of the GNU General Public License as published by
+-# the Free Software Foundation; either version 3, or (at your option)
+-# any later version.
+-#
+-# GCC is distributed in the hope that it will be useful, but
+-# WITHOUT ANY WARRANTY; without even the implied warranty of
+-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+-# General Public License for more details.
+-#
+-# You should have received a copy of the GNU General Public License
+-# along with GCC; see the file COPYING3. If not see
+-# <http://www.gnu.org/licenses/>.
+-
+-LIB1ASMSRC = aarch64/lib1funcs.asm
+-LIB1ASMFUNCS = _aarch64_sync_cache_range
+Index: gcc/config/aarch64/aarch64-freebsd.h
+===================================================================
+--- a/src/gcc/config/aarch64/aarch64-freebsd.h (revision 244242)
++++ a/src/gcc/config/aarch64/aarch64-freebsd.h (nonexistent)
+@@ -1,94 +0,0 @@
+-/* Definitions for AArch64 running FreeBSD
+- Copyright (C) 2016-2017 Free Software Foundation, Inc.
+-
+- This file is part of GCC.
+-
+- GCC is free software; you can redistribute it and/or modify it
+- under the terms of the GNU General Public License as published by
+- the Free Software Foundation; either version 3, or (at your option)
+- any later version.
+-
+- GCC is distributed in the hope that it will be useful, but
+- WITHOUT ANY WARRANTY; without even the implied warranty of
+- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+- General Public License for more details.
+-
+- You should have received a copy of the GNU General Public License
+- along with GCC; see the file COPYING3. If not see
+- <http://www.gnu.org/licenses/>. */
+-
+-#ifndef GCC_AARCH64_FREEBSD_H
+-#define GCC_AARCH64_FREEBSD_H
+-
+-#undef SUBTARGET_CPP_SPEC
+-#define SUBTARGET_CPP_SPEC FBSD_CPP_SPEC
+-
+-#if TARGET_BIG_ENDIAN_DEFAULT
+-#define TARGET_LINKER_EMULATION "aarch64fbsdb"
+-#else
+-#define TARGET_LINKER_EMULATION "aarch64fbsd"
+-#endif
+-
+-#undef SUBTARGET_EXTRA_LINK_SPEC
+-#define SUBTARGET_EXTRA_LINK_SPEC " -m" TARGET_LINKER_EMULATION
+-
+-#undef FBSD_TARGET_LINK_SPEC
+-#define FBSD_TARGET_LINK_SPEC " \
+- %{p:%nconsider using `-pg' instead of `-p' with gprof (1) } \
+- %{v:-V} \
+- %{assert*} %{R*} %{rpath*} %{defsym*} \
+- %{shared:-Bshareable %{h*} %{soname*}} \
+- %{symbolic:-Bsymbolic} \
+- %{static:-Bstatic} \
+- %{!static: \
+- %{rdynamic:-export-dynamic} \
+- %{!shared:-dynamic-linker " FBSD_DYNAMIC_LINKER " }} \
+- -X" SUBTARGET_EXTRA_LINK_SPEC " \
+- %{mbig-endian:-EB} %{mlittle-endian:-EL}"
+-
+-#if TARGET_FIX_ERR_A53_835769_DEFAULT
+-#define CA53_ERR_835769_SPEC \
+- " %{!mno-fix-cortex-a53-835769:--fix-cortex-a53-835769}"
+-#else
+-#define CA53_ERR_835769_SPEC \
+- " %{mfix-cortex-a53-835769:--fix-cortex-a53-835769}"
+-#endif
+-
+-#ifdef TARGET_FIX_ERR_A53_843419_DEFAULT
+-#define CA53_ERR_843419_SPEC \
+- " %{!mno-fix-cortex-a53-843419:--fix-cortex-a53-843419}"
+-#else
+-#define CA53_ERR_843419_SPEC \
+- " %{mfix-cortex-a53-843419:--fix-cortex-a53-843419}"
+-#endif
+-
+-#undef LINK_SPEC
+-#define LINK_SPEC FBSD_TARGET_LINK_SPEC \
+- CA53_ERR_835769_SPEC \
+- CA53_ERR_843419_SPEC
+-
+-#define GNU_USER_TARGET_MATHFILE_SPEC \
+- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}"
+-
+-#undef ENDFILE_SPEC
+-#define ENDFILE_SPEC \
+- GNU_USER_TARGET_MATHFILE_SPEC " " \
+- FBSD_ENDFILE_SPEC
+-
+-#undef TARGET_OS_CPP_BUILTINS
+-#define TARGET_OS_CPP_BUILTINS() \
+- do \
+- { \
+- FBSD_TARGET_OS_CPP_BUILTINS (); \
+- } \
+- while (false)
+-
+-#define TARGET_ASM_FILE_END file_end_indicate_exec_stack
+-
+-/* Uninitialized common symbols in non-PIE executables, even with
+- strong definitions in dependent shared libraries, will resolve
+- to COPY relocated symbol in the executable. See PR65780. */
+-#undef TARGET_BINDS_LOCAL_P
+-#define TARGET_BINDS_LOCAL_P default_binds_local_p_2
+-
+-#endif /* GCC_AARCH64_FREEBSD_H */
diff --git a/debian/patches/gcc-linaro.diff b/debian/patches/gcc-linaro.diff
index 3096f82..2ab9e5d 100644
--- a/debian/patches/gcc-linaro.diff
+++ b/debian/patches/gcc-linaro.diff
@@ -1,16 +1,140 @@
-# DP: Changes for the Linaro 6-2016.10 release.
+# DP: Changes for the Linaro 6-2017.01 release.
MSG=$(git log origin/linaro/gcc-6-branch --format=format:"%s" -n 1 --grep "Merge branches"); SVN=${MSG##* }; git log origin/gcc-6-branch --format=format:"%H" -n 1 --grep "gcc-6-branch@${SVN%.}"
-LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee917a01b63e72dc399c81d26259e78aa \
+LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b2388c31a0f6c7a82110ee77baf2c8bf \
| egrep -v '^(diff|index) ' \
| filterdiff --strip=1 --addoldprefix=a/src/ --addnewprefix=b/src/ \
| sed 's,a/src//dev/null,/dev/null,'
+--- a/src/configure
++++ b/src/configure
+@@ -3483,6 +3483,9 @@ case "${target}" in
+ *-*-vxworks*)
+ noconfigdirs="$noconfigdirs ${libgcj}"
+ ;;
++ aarch64*-*-freebsd*)
++ noconfigdirs="$noconfigdirs target-libffi"
++ ;;
+ alpha*-*-*vms*)
+ noconfigdirs="$noconfigdirs ${libgcj}"
+ ;;
+--- a/src/configure.ac
++++ b/src/configure.ac
+@@ -819,6 +819,9 @@ case "${target}" in
+ *-*-vxworks*)
+ noconfigdirs="$noconfigdirs ${libgcj}"
+ ;;
++ aarch64*-*-freebsd*)
++ noconfigdirs="$noconfigdirs target-libffi"
++ ;;
+ alpha*-*-*vms*)
+ noconfigdirs="$noconfigdirs ${libgcj}"
+ ;;
+--- a/src/contrib/compare_tests
++++ b/src/contrib/compare_tests
+@@ -107,8 +107,8 @@ elif [ -d "$1" -o -d "$2" ] ; then
+ usage "Must specify either two directories or two files"
+ fi
+
+-sed 's/^XFAIL/FAIL/; s/^XPASS/PASS/' < "$1" | awk '/^Running target / {target = $3} { if (target != "unix") { sub(/: /, "&"target": " ); }; print $0; }' | cut -c1-2000 >$tmp1
+-sed 's/^XFAIL/FAIL/; s/^XPASS/PASS/' < "$2" | awk '/^Running target / {target = $3} { if (target != "unix") { sub(/: /, "&"target": " ); }; print $0; }' | cut -c1-2000 >$tmp2
++sed 's/^XFAIL/FAIL/; s/^ERROR/FAIL/; s/^XPASS/PASS/' < "$1" | awk '/^Running target / {target = $3} { if (target != "unix") { sub(/: /, "&"target": " ); }; print $0; }' | cut -c1-2000 >$tmp1
++sed 's/^XFAIL/FAIL/; s/^ERROR/FAIL/; s/^XPASS/PASS/' < "$2" | awk '/^Running target / {target = $3} { if (target != "unix") { sub(/: /, "&"target": " ); }; print $0; }' | cut -c1-2000 >$tmp2
+
+ before=$tmp1
+ now=$tmp2
+--- a/src/contrib/dg-extract-results.py
++++ b/src/contrib/dg-extract-results.py
+@@ -134,6 +134,7 @@ class Prog:
+ self.end_line = None
+ # Known summary types.
+ self.count_names = [
++ '# of DejaGnu errors\t\t',
+ '# of expected passes\t\t',
+ '# of unexpected failures\t',
+ '# of unexpected successes\t',
+@@ -245,6 +246,10 @@ class Prog:
+ segment = Segment (filename, file.tell())
+ variation.header = segment
+
++ # Parse the rest of the summary (the '# of ' lines).
++ if len (variation.counts) == 0:
++ variation.counts = self.zero_counts()
++
+ # Parse up until the first line of the summary.
+ if num_variations == 1:
+ end = '\t\t=== ' + tool.name + ' Summary ===\n'
+@@ -291,6 +296,11 @@ class Prog:
+ harness.results.append ((key, line))
+ if not first_key and sort_logs:
+ first_key = key
++ if line.startswith ('ERROR: (DejaGnu)'):
++ for i in range (len (self.count_names)):
++ if 'DejaGnu errors' in self.count_names[i]:
++ variation.counts[i] += 1
++ break
+
+ # 'Using ...' lines are only interesting in a header. Splitting
+ # the test up into parallel runs leads to more 'Using ...' lines
+@@ -309,9 +319,6 @@ class Prog:
+ segment.lines -= final_using
+ harness.add_segment (first_key, segment)
+
+- # Parse the rest of the summary (the '# of ' lines).
+- if len (variation.counts) == 0:
+- variation.counts = self.zero_counts()
+ while True:
+ before = file.tell()
+ line = file.readline()
+--- a/src/contrib/dg-extract-results.sh
++++ b/src/contrib/dg-extract-results.sh
+@@ -369,10 +369,11 @@ EOF
+ BEGIN {
+ variant="$VAR"
+ tool="$TOOL"
+- passcnt=0; failcnt=0; untstcnt=0; xpasscnt=0; xfailcnt=0; kpasscnt=0; kfailcnt=0; unsupcnt=0; unrescnt=0;
++ passcnt=0; failcnt=0; untstcnt=0; xpasscnt=0; xfailcnt=0; kpasscnt=0; kfailcnt=0; unsupcnt=0; unrescnt=0; dgerrorcnt=0;
+ curvar=""; insummary=0
+ }
+ /^Running target / { curvar = \$3; next }
++/^ERROR: \(DejaGnu\)/ { if (variant == curvar) dgerrorcnt += 1 }
+ /^# of / { if (variant == curvar) insummary = 1 }
+ /^# of expected passes/ { if (insummary == 1) passcnt += \$5; next; }
+ /^# of unexpected successes/ { if (insummary == 1) xpasscnt += \$5; next; }
+@@ -390,6 +391,7 @@ BEGIN {
+ { next }
+ END {
+ printf ("\t\t=== %s Summary for %s ===\n\n", tool, variant)
++ if (dgerrorcnt != 0) printf ("# of DejaGnu errors\t\t%d\n", dgerrorcnt)
+ if (passcnt != 0) printf ("# of expected passes\t\t%d\n", passcnt)
+ if (failcnt != 0) printf ("# of unexpected failures\t%d\n", failcnt)
+ if (xpasscnt != 0) printf ("# of unexpected successes\t%d\n", xpasscnt)
+@@ -419,8 +421,9 @@ TOTAL_AWK=${TMP}/total.awk
+ cat << EOF > $TOTAL_AWK
+ BEGIN {
+ tool="$TOOL"
+- passcnt=0; failcnt=0; untstcnt=0; xpasscnt=0; xfailcnt=0; kfailcnt=0; unsupcnt=0; unrescnt=0
++ passcnt=0; failcnt=0; untstcnt=0; xpasscnt=0; xfailcnt=0; kfailcnt=0; unsupcnt=0; unrescnt=0; dgerrorcnt=0
+ }
++/^# of DejaGnu errors/ { dgerrorcnt += \$5 }
+ /^# of expected passes/ { passcnt += \$5 }
+ /^# of unexpected failures/ { failcnt += \$5 }
+ /^# of unexpected successes/ { xpasscnt += \$5 }
+@@ -431,7 +434,8 @@ BEGIN {
+ /^# of unresolved testcases/ { unrescnt += \$5 }
+ /^# of unsupported tests/ { unsupcnt += \$5 }
+ END {
+- printf ("\n\t\t=== %s Summary ===\n\n", tool)
++ printf ("\n\t\t=== %s MySummary ===\n\n", tool)
++ if (dgerrorcnt != 0) printf ("# of DejaGnu errors\t\t%d\n", dgerrorcnt)
+ if (passcnt != 0) printf ("# of expected passes\t\t%d\n", passcnt)
+ if (failcnt != 0) printf ("# of unexpected failures\t%d\n", failcnt)
+ if (xpasscnt != 0) printf ("# of unexpected successes\t%d\n", xpasscnt)
--- /dev/null
+++ b/src/gcc/LINARO-VERSION
@@ -0,0 +1 @@
-+Snapshot 6.2-2016.10
++Snapshot 6.3-2017.01
--- a/src/gcc/Makefile.in
+++ b/src/gcc/Makefile.in
@@ -832,10 +832,12 @@ BASEVER := $(srcdir)/BASE-VER # 4.x.y
@@ -46,6 +170,50 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
CFLAGS-cppdefault.o += $(PREPROCESSOR_DEFINES)
+--- a/src/gcc/ada/gcc-interface/misc.c
++++ b/src/gcc/ada/gcc-interface/misc.c
+@@ -255,8 +255,7 @@ static bool
+ gnat_post_options (const char **pfilename ATTRIBUTE_UNUSED)
+ {
+ /* Excess precision other than "fast" requires front-end support. */
+- if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD
+- && TARGET_FLT_EVAL_METHOD_NON_DEFAULT)
++ if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD)
+ sorry ("-fexcess-precision=standard for Ada");
+ flag_excess_precision_cmdline = EXCESS_PRECISION_FAST;
+
+--- a/src/gcc/builtins.c
++++ b/src/gcc/builtins.c
+@@ -28,6 +28,7 @@ along with GCC; see the file COPYING3. If not see
+ #include "target.h"
+ #include "rtl.h"
+ #include "tree.h"
++#include "memmodel.h"
+ #include "gimple.h"
+ #include "predict.h"
+ #include "tm_p.h"
+--- a/src/gcc/c-family/c-common.c
++++ b/src/gcc/c-family/c-common.c
+@@ -25,6 +25,7 @@ along with GCC; see the file COPYING3. If not see
+ #include "target.h"
+ #include "function.h"
+ #include "tree.h"
++#include "memmodel.h"
+ #include "c-common.h"
+ #include "gimple-expr.h"
+ #include "tm_p.h"
+--- a/src/gcc/c-family/c-opts.c
++++ b/src/gcc/c-family/c-opts.c
+@@ -767,8 +767,7 @@ c_common_post_options (const char **pfilename)
+ support. */
+ if (c_dialect_cxx ())
+ {
+- if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD
+- && TARGET_FLT_EVAL_METHOD_NON_DEFAULT)
++ if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD)
+ sorry ("-fexcess-precision=standard for C++");
+ flag_excess_precision_cmdline = EXCESS_PRECISION_FAST;
+ }
--- a/src/gcc/calls.c
+++ b/src/gcc/calls.c
@@ -194,10 +194,19 @@ prepare_call_address (tree fndecl_or_type, rtx funexp, rtx static_chain_value,
@@ -82,6 +250,38 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
delete loop_copy;
loop_copy = NULL;
delete original_copy_bb_pool;
+--- a/src/gcc/common/config/arm/arm-common.c
++++ b/src/gcc/common/config/arm/arm-common.c
+@@ -97,6 +97,29 @@ arm_rewrite_mcpu (int argc, const char **argv)
+ return arm_rewrite_selected_cpu (argv[argc - 1]);
+ }
+
++/* Called by the driver to check whether the target denoted by current
++ command line options is a Thumb-only target. ARGV is an array of
++ -march and -mcpu values (ie. it contains the rhs after the equal
++ sign) and we use the last one of them to make a decision. The
++ number of elements in ARGV is given in ARGC. */
++const char *
++arm_target_thumb_only (int argc, const char **argv)
++{
++ unsigned int opt;
++
++ if (argc)
++ {
++ for (opt = 0; opt < (ARRAY_SIZE (arm_arch_core_flags)); opt++)
++ if ((strcmp (argv[argc - 1], arm_arch_core_flags[opt].name) == 0)
++ && !ARM_FSET_HAS_CPU1(arm_arch_core_flags[opt].flags, FL_NOTM))
++ return "-mthumb";
++
++ return NULL;
++ }
++ else
++ return NULL;
++}
++
+ #undef ARM_CPU_NAME_LENGTH
+
+
--- a/src/gcc/config.gcc
+++ b/src/gcc/config.gcc
@@ -307,7 +307,7 @@ m32c*-*-*)
@@ -98,11 +298,23 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
cpu_type=arm
extra_objs="arm-builtins.o aarch-common.o"
- extra_headers="mmintrin.h arm_neon.h arm_acle.h"
-+ extra_headers="mmintrin.h arm_neon.h arm_acle.h arm_fp16.h"
++ extra_headers="mmintrin.h arm_neon.h arm_acle.h arm_fp16.h arm_cmse.h"
target_type_format_char='%'
c_target_objs="arm-c.o"
cxx_target_objs="arm-c.o"
-@@ -1495,7 +1495,7 @@ i[34567]86-*-linux* | i[34567]86-*-kfreebsd*-gnu | i[34567]86-*-knetbsd*-gnu | i
+@@ -946,6 +946,11 @@ aarch64*-*-elf | aarch64*-*-rtems*)
+ done
+ TM_MULTILIB_CONFIG=`echo $TM_MULTILIB_CONFIG | sed 's/^,//'`
+ ;;
++aarch64*-*-freebsd*)
++ tm_file="${tm_file} dbxelf.h elfos.h ${fbsd_tm_file}"
++ tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-freebsd.h"
++ tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-freebsd"
++ ;;
+ aarch64*-*-linux*)
+ tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h glibc-stdint.h"
+ tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-linux.h"
+@@ -1495,7 +1500,7 @@ i[34567]86-*-linux* | i[34567]86-*-kfreebsd*-gnu | i[34567]86-*-knetbsd*-gnu | i
extra_options="${extra_options} linux-android.opt"
# Assume modern glibc if not targeting Android nor uclibc.
case ${target} in
@@ -111,7 +323,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
;;
*)
default_gnu_indirect_function=yes
-@@ -1564,7 +1564,7 @@ x86_64-*-linux* | x86_64-*-kfreebsd*-gnu | x86_64-*-knetbsd*-gnu)
+@@ -1564,7 +1569,7 @@ x86_64-*-linux* | x86_64-*-kfreebsd*-gnu | x86_64-*-knetbsd*-gnu)
extra_options="${extra_options} linux-android.opt"
# Assume modern glibc if not targeting Android nor uclibc.
case ${target} in
@@ -120,7 +332,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
;;
*)
default_gnu_indirect_function=yes
-@@ -3806,38 +3806,40 @@ case "${target}" in
+@@ -3806,38 +3811,51 @@ case "${target}" in
# Add extra multilibs
if test "x$with_multilib_list" != x; then
arm_multilibs=`echo $with_multilib_list | sed -e 's/,/ /g'`
@@ -138,21 +350,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- # pragmatic. Additionally it is only
- # designed to work without any
- # with-cpu, with-arch with-mode
-+ # pragmatic.
-+ tmake_profile_file="arm/t-aprofile"
-+ ;;
-+ default)
-+ ;;
-+ *)
-+ echo "Error: --with-multilib-list=${with_multilib_list} not supported." 1>&2
-+ exit 1
-+ ;;
-+ esac
-+
-+ if test "x${tmake_profile_file}" != x ; then
-+ # arm/t-aprofile is only designed to work
-+ # without any with-cpu, with-arch, with-mode,
- # with-fpu or with-float options.
+- # with-fpu or with-float options.
- if test "x$with_arch" != x \
- || test "x$with_cpu" != x \
- || test "x$with_float" != x \
@@ -172,6 +370,32 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ;;
- esac
- done
++ # pragmatic.
++ tmake_profile_file="arm/t-aprofile"
++ ;;
++ rmprofile)
++ # Note that arm/t-rmprofile is a
++ # stand-alone make file fragment to be
++ # used only with itself. We do not
++ # specifically use the
++ # TM_MULTILIB_OPTION framework because
++ # this shorthand is more
++ # pragmatic.
++ tmake_profile_file="arm/t-rmprofile"
++ ;;
++ default)
++ ;;
++ *)
++ echo "Error: --with-multilib-list=${with_multilib_list} not supported." 1>&2
++ exit 1
++ ;;
++ esac
++
++ if test "x${tmake_profile_file}" != x ; then
++ # arm/t-aprofile and arm/t-rmprofile are only
++ # designed to work without any with-cpu,
++ # with-arch, with-mode, with-fpu or with-float
++ # options.
+ if test "x$with_arch" != x \
+ || test "x$with_cpu" != x \
+ || test "x$with_float" != x \
@@ -186,6 +410,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
fi
;;
+--- a/src/gcc/config.host
++++ b/src/gcc/config.host
+@@ -99,7 +99,7 @@ case ${host} in
+ esac
+
+ case ${host} in
+- aarch64*-*-linux*)
++ aarch64*-*-freebsd* | aarch64*-*-linux*)
+ case ${target} in
+ aarch64*-*-*)
+ host_extra_gcc_objs="driver-aarch64.o"
--- a/src/gcc/config/aarch64/aarch64-arches.def
+++ b/src/gcc/config/aarch64/aarch64-arches.def
@@ -32,4 +32,5 @@
@@ -215,7 +450,18 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
aarch64_types_binopp_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_poly, qualifier_poly, qualifier_poly };
#define TYPES_BINOPP (aarch64_types_binopp_qualifiers)
-@@ -173,6 +178,10 @@ aarch64_types_shift_to_unsigned_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+@@ -164,6 +169,10 @@ aarch64_types_quadop_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ #define TYPES_QUADOP_LANE (aarch64_types_quadop_lane_qualifiers)
+
+ static enum aarch64_type_qualifiers
++aarch64_types_binop_imm_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
++ = { qualifier_poly, qualifier_none, qualifier_immediate };
++#define TYPES_GETREGP (aarch64_types_binop_imm_p_qualifiers)
++static enum aarch64_type_qualifiers
+ aarch64_types_binop_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_none, qualifier_none, qualifier_immediate };
+ #define TYPES_GETREG (aarch64_types_binop_imm_qualifiers)
+@@ -173,16 +182,29 @@ aarch64_types_shift_to_unsigned_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_unsigned, qualifier_none, qualifier_immediate };
#define TYPES_SHIFTIMM_USS (aarch64_types_shift_to_unsigned_qualifiers)
static enum aarch64_type_qualifiers
@@ -226,6 +472,73 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
aarch64_types_unsigned_shift_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_unsigned, qualifier_unsigned, qualifier_immediate };
#define TYPES_USHIFTIMM (aarch64_types_unsigned_shift_qualifiers)
+
+ static enum aarch64_type_qualifiers
+-aarch64_types_ternop_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+- = { qualifier_none, qualifier_none, qualifier_none, qualifier_immediate };
+-#define TYPES_SETREG (aarch64_types_ternop_imm_qualifiers)
+-#define TYPES_SHIFTINSERT (aarch64_types_ternop_imm_qualifiers)
+-#define TYPES_SHIFTACC (aarch64_types_ternop_imm_qualifiers)
++aarch64_types_ternop_s_imm_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
++ = { qualifier_none, qualifier_none, qualifier_poly, qualifier_immediate};
++#define TYPES_SETREGP (aarch64_types_ternop_s_imm_p_qualifiers)
++static enum aarch64_type_qualifiers
++aarch64_types_ternop_s_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
++ = { qualifier_none, qualifier_none, qualifier_none, qualifier_immediate};
++#define TYPES_SETREG (aarch64_types_ternop_s_imm_qualifiers)
++#define TYPES_SHIFTINSERT (aarch64_types_ternop_s_imm_qualifiers)
++#define TYPES_SHIFTACC (aarch64_types_ternop_s_imm_qualifiers)
++
++static enum aarch64_type_qualifiers
++aarch64_types_ternop_p_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
++ = { qualifier_poly, qualifier_poly, qualifier_poly, qualifier_immediate};
++#define TYPES_SHIFTINSERTP (aarch64_types_ternop_p_imm_qualifiers)
+
+ static enum aarch64_type_qualifiers
+ aarch64_types_unsigned_shiftacc_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+@@ -197,6 +219,11 @@ aarch64_types_combine_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ #define TYPES_COMBINE (aarch64_types_combine_qualifiers)
+
+ static enum aarch64_type_qualifiers
++aarch64_types_combine_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
++ = { qualifier_poly, qualifier_poly, qualifier_poly };
++#define TYPES_COMBINEP (aarch64_types_combine_p_qualifiers)
++
++static enum aarch64_type_qualifiers
+ aarch64_types_load1_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_none, qualifier_const_pointer_map_mode };
+ #define TYPES_LOAD1 (aarch64_types_load1_qualifiers)
+@@ -229,6 +256,10 @@ aarch64_types_bsl_u_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ qualifier_map_mode | qualifier_pointer to build a pointer to the
+ element type of the vector. */
+ static enum aarch64_type_qualifiers
++aarch64_types_store1_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
++ = { qualifier_void, qualifier_pointer_map_mode, qualifier_poly };
++#define TYPES_STORE1P (aarch64_types_store1_p_qualifiers)
++static enum aarch64_type_qualifiers
+ aarch64_types_store1_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_void, qualifier_pointer_map_mode, qualifier_none };
+ #define TYPES_STORE1 (aarch64_types_store1_qualifiers)
+@@ -753,16 +784,16 @@ aarch64_init_simd_builtins (void)
+
+ if (qualifiers & qualifier_unsigned)
+ {
+- type_signature[arg_num] = 'u';
++ type_signature[op_num] = 'u';
+ print_type_signature_p = true;
+ }
+ else if (qualifiers & qualifier_poly)
+ {
+- type_signature[arg_num] = 'p';
++ type_signature[op_num] = 'p';
+ print_type_signature_p = true;
+ }
+ else
+- type_signature[arg_num] = 's';
++ type_signature[op_num] = 's';
+
+ /* Skip an internal operand for vget_{low, high}. */
+ if (qualifiers & qualifier_internal)
--- a/src/gcc/config/aarch64/aarch64-c.c
+++ b/src/gcc/config/aarch64/aarch64-c.c
@@ -95,6 +95,11 @@ aarch64_update_cpp_builtins (cpp_reader *pfile)
@@ -242,19 +555,33 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
--- a/src/gcc/config/aarch64/aarch64-cores.def
+++ b/src/gcc/config/aarch64/aarch64-cores.def
-@@ -44,13 +44,19 @@ AARCH64_CORE("cortex-a35", cortexa35, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AA
+@@ -40,17 +40,33 @@
+
+ /* V8 Architecture Processors. */
+
++/* ARM ('A') cores. */
+ AARCH64_CORE("cortex-a35", cortexa35, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa35, "0x41", "0xd04")
AARCH64_CORE("cortex-a53", cortexa53, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa53, "0x41", "0xd03")
AARCH64_CORE("cortex-a57", cortexa57, cortexa57, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, "0x41", "0xd07")
AARCH64_CORE("cortex-a72", cortexa72, cortexa57, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa72, "0x41", "0xd08")
+AARCH64_CORE("cortex-a73", cortexa73, cortexa57, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa73, "0x41", "0xd09")
++
++/* Samsung ('S') cores. */
AARCH64_CORE("exynos-m1", exynosm1, exynosm1, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, exynosm1, "0x53", "0x001")
-AARCH64_CORE("qdf24xx", qdf24xx, cortexa57, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, cortexa57, "0x51", "0x800")
++
++/* Qualcomm ('Q') cores. */
+AARCH64_CORE("qdf24xx", qdf24xx, cortexa57, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, qdf24xx, "0x51", "0x800")
++
++/* Cavium ('C') cores. */
AARCH64_CORE("thunderx", thunderx, thunderx, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx, "0x43", "0x0a1")
++
++/* APM ('P') cores. */
AARCH64_CORE("xgene1", xgene1, xgene1, 8A, AARCH64_FL_FOR_ARCH8, xgene1, "0x50", "0x000")
+/* V8.1 Architecture Processors. */
+
++/* Broadcom ('B') cores. */
+AARCH64_CORE("vulcan", vulcan, cortexa57, 8_1A, AARCH64_FL_FOR_ARCH8_1 | AARCH64_FL_CRYPTO, vulcan, "0x42", "0x516")
+
/* V8 big.LITTLE implementations. */
@@ -393,6 +720,103 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
#define TEXT_SECTION_ASM_OP "\t.text"
#define DATA_SECTION_ASM_OP "\t.data"
#define BSS_SECTION_ASM_OP "\t.bss"
+--- /dev/null
++++ b/src/gcc/config/aarch64/aarch64-freebsd.h
+@@ -0,0 +1,94 @@
++/* Definitions for AArch64 running FreeBSD
++ Copyright (C) 2016 Free Software Foundation, Inc.
++
++ This file is part of GCC.
++
++ GCC is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ GCC is distributed in the hope that it will be useful, but
++ WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ General Public License for more details.
++
++ You should have received a copy of the GNU General Public License
++ along with GCC; see the file COPYING3. If not see
++ <http://www.gnu.org/licenses/>. */
++
++#ifndef GCC_AARCH64_FREEBSD_H
++#define GCC_AARCH64_FREEBSD_H
++
++#undef SUBTARGET_CPP_SPEC
++#define SUBTARGET_CPP_SPEC FBSD_CPP_SPEC
++
++#if TARGET_BIG_ENDIAN_DEFAULT
++#define TARGET_LINKER_EMULATION "aarch64fbsdb"
++#else
++#define TARGET_LINKER_EMULATION "aarch64fbsd"
++#endif
++
++#undef SUBTARGET_EXTRA_LINK_SPEC
++#define SUBTARGET_EXTRA_LINK_SPEC " -m" TARGET_LINKER_EMULATION
++
++#undef FBSD_TARGET_LINK_SPEC
++#define FBSD_TARGET_LINK_SPEC " \
++ %{p:%nconsider using `-pg' instead of `-p' with gprof (1) } \
++ %{v:-V} \
++ %{assert*} %{R*} %{rpath*} %{defsym*} \
++ %{shared:-Bshareable %{h*} %{soname*}} \
++ %{symbolic:-Bsymbolic} \
++ %{static:-Bstatic} \
++ %{!static: \
++ %{rdynamic:-export-dynamic} \
++ %{!shared:-dynamic-linker " FBSD_DYNAMIC_LINKER " }} \
++ -X" SUBTARGET_EXTRA_LINK_SPEC " \
++ %{mbig-endian:-EB} %{mlittle-endian:-EL}"
++
++#if TARGET_FIX_ERR_A53_835769_DEFAULT
++#define CA53_ERR_835769_SPEC \
++ " %{!mno-fix-cortex-a53-835769:--fix-cortex-a53-835769}"
++#else
++#define CA53_ERR_835769_SPEC \
++ " %{mfix-cortex-a53-835769:--fix-cortex-a53-835769}"
++#endif
++
++#ifdef TARGET_FIX_ERR_A53_843419_DEFAULT
++#define CA53_ERR_843419_SPEC \
++ " %{!mno-fix-cortex-a53-843419:--fix-cortex-a53-843419}"
++#else
++#define CA53_ERR_843419_SPEC \
++ " %{mfix-cortex-a53-843419:--fix-cortex-a53-843419}"
++#endif
++
++#undef LINK_SPEC
++#define LINK_SPEC FBSD_TARGET_LINK_SPEC \
++ CA53_ERR_835769_SPEC \
++ CA53_ERR_843419_SPEC
++
++#define GNU_USER_TARGET_MATHFILE_SPEC \
++ "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}"
++
++#undef ENDFILE_SPEC
++#define ENDFILE_SPEC \
++ GNU_USER_TARGET_MATHFILE_SPEC " " \
++ FBSD_ENDFILE_SPEC
++
++#undef TARGET_OS_CPP_BUILTINS
++#define TARGET_OS_CPP_BUILTINS() \
++ do \
++ { \
++ FBSD_TARGET_OS_CPP_BUILTINS (); \
++ } \
++ while (false)
++
++#define TARGET_ASM_FILE_END file_end_indicate_exec_stack
++
++/* Uninitialized common symbols in non-PIE executables, even with
++ strong definitions in dependent shared libraries, will resolve
++ to COPY relocated symbol in the executable. See PR65780. */
++#undef TARGET_BINDS_LOCAL_P
++#define TARGET_BINDS_LOCAL_P default_binds_local_p_2
++
++#endif /* GCC_AARCH64_FREEBSD_H */
--- a/src/gcc/config/aarch64/aarch64-modes.def
+++ b/src/gcc/config/aarch64/aarch64-modes.def
@@ -21,8 +21,6 @@
@@ -517,9 +941,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
void init_aarch64_simd_builtins (void);
--- a/src/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/src/gcc/config/aarch64/aarch64-simd-builtins.def
-@@ -41,8 +41,8 @@
+@@ -40,9 +40,10 @@
+ 10 - CODE_FOR_<name><mode>. */
BUILTIN_VDC (COMBINE, combine, 0)
++ VAR1 (COMBINEP, combine, 0, di)
BUILTIN_VB (BINOP, pmul, 0)
- BUILTIN_VALLF (BINOP, fmulx, 0)
- BUILTIN_VDQF_DF (UNOP, sqrt, 2)
@@ -528,7 +954,39 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
BUILTIN_VD_BHSI (BINOP, addp, 0)
VAR1 (UNOP, addp, 0, di)
BUILTIN_VDQ_BHSI (UNOP, clrsb, 2)
-@@ -234,105 +234,145 @@
+@@ -68,14 +69,23 @@
+ BUILTIN_VDC (GETREG, get_dregoi, 0)
+ BUILTIN_VDC (GETREG, get_dregci, 0)
+ BUILTIN_VDC (GETREG, get_dregxi, 0)
++ VAR1 (GETREGP, get_dregoi, 0, di)
++ VAR1 (GETREGP, get_dregci, 0, di)
++ VAR1 (GETREGP, get_dregxi, 0, di)
+ /* Implemented by aarch64_get_qreg<VSTRUCT:mode><VQ:mode>. */
+ BUILTIN_VQ (GETREG, get_qregoi, 0)
+ BUILTIN_VQ (GETREG, get_qregci, 0)
+ BUILTIN_VQ (GETREG, get_qregxi, 0)
++ VAR1 (GETREGP, get_qregoi, 0, v2di)
++ VAR1 (GETREGP, get_qregci, 0, v2di)
++ VAR1 (GETREGP, get_qregxi, 0, v2di)
+ /* Implemented by aarch64_set_qreg<VSTRUCT:mode><VQ:mode>. */
+ BUILTIN_VQ (SETREG, set_qregoi, 0)
+ BUILTIN_VQ (SETREG, set_qregci, 0)
+ BUILTIN_VQ (SETREG, set_qregxi, 0)
++ VAR1 (SETREGP, set_qregoi, 0, v2di)
++ VAR1 (SETREGP, set_qregci, 0, v2di)
++ VAR1 (SETREGP, set_qregxi, 0, v2di)
+ /* Implemented by aarch64_ld<VSTRUCT:nregs><VDC:mode>. */
+ BUILTIN_VDC (LOADSTRUCT, ld2, 0)
+ BUILTIN_VDC (LOADSTRUCT, ld3, 0)
+@@ -224,6 +234,7 @@
+ BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssri_n, 0)
+ BUILTIN_VSDQ_I_DI (USHIFTACC, usri_n, 0)
+ BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssli_n, 0)
++ VAR2 (SHIFTINSERTP, ssli_n, 0, di, v2di)
+ BUILTIN_VSDQ_I_DI (USHIFTACC, usli_n, 0)
+ /* Implemented by aarch64_<sur>qshl<u>_n<mode>. */
+ BUILTIN_VSDQ_I (SHIFTIMM_USS, sqshlu_n, 0)
+@@ -234,105 +245,145 @@
BUILTIN_VALL (UNOP, reduc_plus_scal_, 10)
/* Implemented by reduc_<maxmin_uns>_scal_<mode> (producing scalar). */
@@ -696,7 +1154,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
VAR1 (UNOP, floatunsv2si, 2, v2sf)
VAR1 (UNOP, floatunsv4si, 2, v4sf)
VAR1 (UNOP, floatunsv2di, 2, v2df)
-@@ -352,19 +392,19 @@
+@@ -352,19 +403,19 @@
/* Implemented by
aarch64_frecp<FRECP:frecp_suffix><mode>. */
@@ -722,8 +1180,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
BUILTIN_VQ_HSF (UNOP, vec_unpacks_hi_, 10)
VAR1 (BINOP, float_truncate_hi_, 0, v4sf)
-@@ -381,7 +421,11 @@
+@@ -376,15 +427,22 @@
+
+ /* Implemented by aarch64_ld1<VALL_F16:mode>. */
+ BUILTIN_VALL_F16 (LOAD1, ld1, 0)
++ VAR1(STORE1P, ld1, 0, v2di)
+
+ /* Implemented by aarch64_st1<VALL_F16:mode>. */
BUILTIN_VALL_F16 (STORE1, st1, 0)
++ VAR1(STORE1P, st1, 0, v2di)
/* Implemented by fma<mode>4. */
- BUILTIN_VDQF (TERNOP, fma, 4)
@@ -735,7 +1200,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Implemented by aarch64_simd_bsl<mode>. */
BUILTIN_VDQQH (BSL_P, simd_bsl, 0)
-@@ -436,7 +480,7 @@
++ VAR2 (BSL_P, simd_bsl,0, di, v2di)
+ BUILTIN_VSDQ_I_DI (BSL_U, simd_bsl, 0)
+ BUILTIN_VALLDIF (BSL_S, simd_bsl, 0)
+
+@@ -436,7 +494,7 @@
VAR1 (TERNOP, qtbx4, 0, v8qi)
VAR1 (TERNOP, qtbx4, 0, v16qi)
@@ -744,7 +1213,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Implemented by aarch64_sqrdml<SQRDMLH_AS:rdma_as>h<mode>. */
BUILTIN_VSDQ_HSI (TERNOP, sqrdmlah, 0)
-@@ -449,3 +493,60 @@
+@@ -449,3 +507,60 @@
/* Implemented by aarch64_sqrdml<SQRDMLH_AS:rdma_as>h_laneq<mode>. */
BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlah_laneq, 0)
BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_laneq, 0)
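(Illustration, not part of the patch: the poly-qualified entries added to
this file (COMBINEP, GETREGP/SETREGP, LOAD1P/STORE1P, SHIFTINSERTP, BSL_P)
back the poly64/poly128 NEON intrinsics. A small sketch of user code that
reaches two of them, assuming -march=armv8-a+crypto for the poly64 types:

    #include <arm_neon.h>

    poly64x2_t
    load_p64x2 (const poly64_t *p)
    {
      return vld1q_p64 (p);          /* via the v2di ld1 entry */
    }

    poly64x2_t
    combine_p64 (poly64x1_t lo, poly64x1_t hi)
    {
      return vcombine_p64 (lo, hi);  /* via the COMBINEP entry */
    })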
@@ -887,13 +1356,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- (abs:VDQF (minus:VDQF
- (match_operand:VDQF 1 "register_operand" "w")
- (match_operand:VDQF 2 "register_operand" "w"))))]
-+(define_insn "fabd<mode>3"
-+ [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w")
-+ (abs:VHSDF_HSDF
-+ (minus:VHSDF_HSDF
-+ (match_operand:VHSDF_HSDF 1 "register_operand" "w")
-+ (match_operand:VHSDF_HSDF 2 "register_operand" "w"))))]
- "TARGET_SIMD"
+- "TARGET_SIMD"
- "fabd\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
- [(set_attr "type" "neon_fp_abd_<Vetype><q>")]
-)
@@ -903,7 +1366,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- (abs:GPF (minus:GPF
- (match_operand:GPF 1 "register_operand" "w")
- (match_operand:GPF 2 "register_operand" "w"))))]
-- "TARGET_SIMD"
++(define_insn "fabd<mode>3"
++ [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w")
++ (abs:VHSDF_HSDF
++ (minus:VHSDF_HSDF
++ (match_operand:VHSDF_HSDF 1 "register_operand" "w")
++ (match_operand:VHSDF_HSDF 2 "register_operand" "w"))))]
+ "TARGET_SIMD"
- "fabd\t%<s>0, %<s>1, %<s>2"
- [(set_attr "type" "neon_fp_abd_<Vetype><q>")]
+ "fabd\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
@@ -1697,10 +2166,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(const_int 0))) 0))]
- "TARGET_SIMD"
+ "TARGET_SIMD && !BYTES_BIG_ENDIAN"
-+ "ld1\\t{%S0.1d - %T0.1d}, %1"
-+ [(set_attr "type" "neon_load1_2reg<q>")]
-+)
-+
+ "ld1\\t{%S0.1d - %T0.1d}, %1"
+ [(set_attr "type" "neon_load1_2reg<q>")]
+ )
+
+-(define_insn "aarch64_ld3<mode>_dreg"
+(define_insn "aarch64_ld2<mode>_dreg_be"
+ [(set (match_operand:OI 0 "register_operand" "=w")
+ (subreg:OI
@@ -1715,11 +2185,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ (unspec:DX [(match_dup 1)]
+ UNSPEC_LD2))) 0))]
+ "TARGET_SIMD && BYTES_BIG_ENDIAN"
- "ld1\\t{%S0.1d - %T0.1d}, %1"
- [(set_attr "type" "neon_load1_2reg<q>")]
- )
-
--(define_insn "aarch64_ld3<mode>_dreg"
++ "ld1\\t{%S0.1d - %T0.1d}, %1"
++ [(set_attr "type" "neon_load1_2reg<q>")]
++)
++
+(define_insn "aarch64_ld3<mode>_dreg_le"
[(set (match_operand:CI 0 "register_operand" "=w")
(subreg:CI
@@ -2033,6 +2502,33 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
)
;; sha1
+@@ -5435,6 +5671,26 @@
+ [(set_attr "type" "crypto_sha1_fast")]
+ )
+
++(define_insn "aarch64_crypto_sha1hv4si"
++ [(set (match_operand:SI 0 "register_operand" "=w")
++ (unspec:SI [(vec_select:SI (match_operand:V4SI 1 "register_operand" "w")
++ (parallel [(const_int 0)]))]
++ UNSPEC_SHA1H))]
++ "TARGET_SIMD && TARGET_CRYPTO && !BYTES_BIG_ENDIAN"
++ "sha1h\\t%s0, %s1"
++ [(set_attr "type" "crypto_sha1_fast")]
++)
++
++(define_insn "aarch64_be_crypto_sha1hv4si"
++ [(set (match_operand:SI 0 "register_operand" "=w")
++ (unspec:SI [(vec_select:SI (match_operand:V4SI 1 "register_operand" "w")
++ (parallel [(const_int 3)]))]
++ UNSPEC_SHA1H))]
++ "TARGET_SIMD && TARGET_CRYPTO && BYTES_BIG_ENDIAN"
++ "sha1h\\t%s0, %s1"
++ [(set_attr "type" "crypto_sha1_fast")]
++)
++
+ (define_insn "aarch64_crypto_sha1su1v4si"
+ [(set (match_operand:V4SI 0 "register_operand" "=w")
+ (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0")
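(Illustration, not part of the patch: sha1h needs separate little- and
big-endian patterns because the 32-bit hash element lives in lane 0 of the
V4SI register on little-endian but in lane 3 under GCC's big-endian lane
numbering, as the two vec_select indices above show. User code reaches these
patterns through the ACLE intrinsic, compiled with -march=armv8-a+crypto:

    #include <arm_neon.h>

    uint32_t
    sha1_fixed_rotate (uint32_t hash_e)
    {
      return vsha1h_u32 (hash_e);   /* SHA1H: rotate left by 30 */
    })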
--- a/src/gcc/config/aarch64/aarch64-tune.md
+++ b/src/gcc/config/aarch64/aarch64-tune.md
@@ -1,5 +1,5 @@
@@ -2056,7 +2552,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW)
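(Illustration, not part of the patch: the new slow_unaligned_ldpw tuning
flag lets a core opt out of fusing two adjacent 32-bit memory accesses into
a load/store pair when the pair might be unaligned; the ldp/stp operand
checks added to aarch64.c later in this patch consult it. A sketch of the
kind of code affected:

    /* Candidates for ldp fusion; on cores that set slow_unaligned_ldpw
       and cannot prove alignment, these may stay as two ldr insns.  */
    int
    sum2 (const int *p)
    {
      return p[0] + p[1];
    })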
--- a/src/gcc/config/aarch64/aarch64.c
+++ b/src/gcc/config/aarch64/aarch64.c
-@@ -152,7 +152,7 @@ enum aarch64_processor aarch64_tune = cortexa53;
+@@ -26,6 +26,7 @@
+ #include "target.h"
+ #include "rtl.h"
+ #include "tree.h"
++#include "memmodel.h"
+ #include "gimple.h"
+ #include "cfghooks.h"
+ #include "cfgloop.h"
+@@ -152,7 +153,7 @@ enum aarch64_processor aarch64_tune = cortexa53;
unsigned long aarch64_tune_flags = 0;
/* Global flag for PC relative loads. */
@@ -2065,7 +2569,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Support for command line parsing of boolean flags in the tuning
structures. */
-@@ -250,6 +250,38 @@ static const struct cpu_addrcost_table xgene1_addrcost_table =
+@@ -250,6 +251,38 @@ static const struct cpu_addrcost_table xgene1_addrcost_table =
0, /* imm_offset */
};
@@ -2104,7 +2608,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
static const struct cpu_regmove_cost generic_regmove_cost =
{
1, /* GP2GP */
-@@ -308,6 +340,24 @@ static const struct cpu_regmove_cost xgene1_regmove_cost =
+@@ -308,6 +341,24 @@ static const struct cpu_regmove_cost xgene1_regmove_cost =
2 /* FP2FP */
};
@@ -2129,7 +2633,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Generic costs for vector insn classes. */
static const struct cpu_vector_cost generic_vector_cost =
{
-@@ -326,6 +376,24 @@ static const struct cpu_vector_cost generic_vector_cost =
+@@ -326,6 +377,24 @@ static const struct cpu_vector_cost generic_vector_cost =
1 /* cond_not_taken_branch_cost */
};
@@ -2154,7 +2658,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Generic costs for vector insn classes. */
static const struct cpu_vector_cost cortexa57_vector_cost =
{
-@@ -379,6 +447,24 @@ static const struct cpu_vector_cost xgene1_vector_cost =
+@@ -379,6 +448,24 @@ static const struct cpu_vector_cost xgene1_vector_cost =
1 /* cond_not_taken_branch_cost */
};
@@ -2179,7 +2683,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Generic costs for branch instructions. */
static const struct cpu_branch_cost generic_branch_cost =
{
-@@ -393,6 +479,37 @@ static const struct cpu_branch_cost cortexa57_branch_cost =
+@@ -393,6 +480,37 @@ static const struct cpu_branch_cost cortexa57_branch_cost =
3 /* Unpredictable. */
};
@@ -2217,7 +2721,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
static const struct tune_params generic_tunings =
{
&cortexa57_extra_costs,
-@@ -400,6 +517,7 @@ static const struct tune_params generic_tunings =
+@@ -400,6 +518,7 @@ static const struct tune_params generic_tunings =
&generic_regmove_cost,
&generic_vector_cost,
&generic_branch_cost,
@@ -2225,7 +2729,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
4, /* memmov_cost */
2, /* issue_rate */
AARCH64_FUSE_NOTHING, /* fusible_ops */
-@@ -423,14 +541,15 @@ static const struct tune_params cortexa35_tunings =
+@@ -423,14 +542,15 @@ static const struct tune_params cortexa35_tunings =
&generic_addrcost_table,
&cortexa53_regmove_cost,
&generic_vector_cost,
@@ -2245,7 +2749,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* vec_reassoc_width. */
-@@ -448,14 +567,15 @@ static const struct tune_params cortexa53_tunings =
+@@ -448,14 +568,15 @@ static const struct tune_params cortexa53_tunings =
&generic_addrcost_table,
&cortexa53_regmove_cost,
&generic_vector_cost,
@@ -2264,7 +2768,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* vec_reassoc_width. */
-@@ -474,13 +594,14 @@ static const struct tune_params cortexa57_tunings =
+@@ -474,13 +595,14 @@ static const struct tune_params cortexa57_tunings =
&cortexa57_regmove_cost,
&cortexa57_vector_cost,
&cortexa57_branch_cost,
@@ -2280,7 +2784,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* vec_reassoc_width. */
-@@ -498,14 +619,15 @@ static const struct tune_params cortexa72_tunings =
+@@ -498,14 +620,15 @@ static const struct tune_params cortexa72_tunings =
&cortexa57_addrcost_table,
&cortexa57_regmove_cost,
&cortexa57_vector_cost,
@@ -2298,7 +2802,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* vec_reassoc_width. */
-@@ -513,7 +635,33 @@ static const struct tune_params cortexa72_tunings =
+@@ -513,7 +636,33 @@ static const struct tune_params cortexa72_tunings =
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
0, /* cache_line_size. */
@@ -2333,7 +2837,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
};
-@@ -524,6 +672,7 @@ static const struct tune_params exynosm1_tunings =
+@@ -524,6 +673,7 @@ static const struct tune_params exynosm1_tunings =
&exynosm1_regmove_cost,
&exynosm1_vector_cost,
&generic_branch_cost,
@@ -2341,7 +2845,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
4, /* memmov_cost */
3, /* issue_rate */
(AARCH64_FUSE_AES_AESMC), /* fusible_ops */
-@@ -538,7 +687,7 @@ static const struct tune_params exynosm1_tunings =
+@@ -538,7 +688,7 @@ static const struct tune_params exynosm1_tunings =
48, /* max_case_values. */
64, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
@@ -2350,7 +2854,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
};
static const struct tune_params thunderx_tunings =
-@@ -546,8 +695,9 @@ static const struct tune_params thunderx_tunings =
+@@ -546,8 +696,9 @@ static const struct tune_params thunderx_tunings =
&thunderx_extra_costs,
&generic_addrcost_table,
&thunderx_regmove_cost,
@@ -2361,7 +2865,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
6, /* memmov_cost */
2, /* issue_rate */
AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
-@@ -562,7 +712,7 @@ static const struct tune_params thunderx_tunings =
+@@ -562,7 +713,7 @@ static const struct tune_params thunderx_tunings =
0, /* max_case_values. */
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
@@ -2370,7 +2874,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
};
static const struct tune_params xgene1_tunings =
-@@ -572,6 +722,7 @@ static const struct tune_params xgene1_tunings =
+@@ -572,6 +723,7 @@ static const struct tune_params xgene1_tunings =
&xgene1_regmove_cost,
&xgene1_vector_cost,
&generic_branch_cost,
@@ -2378,7 +2882,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
6, /* memmov_cost */
4, /* issue_rate */
AARCH64_FUSE_NOTHING, /* fusible_ops */
-@@ -586,7 +737,58 @@ static const struct tune_params xgene1_tunings =
+@@ -586,7 +738,58 @@ static const struct tune_params xgene1_tunings =
0, /* max_case_values. */
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
@@ -2438,7 +2942,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
};
/* Support for fine-grained override of the tuning structures. */
-@@ -663,16 +865,6 @@ struct aarch64_option_extension
+@@ -663,16 +866,6 @@ struct aarch64_option_extension
const unsigned long flags_off;
};
@@ -2455,7 +2959,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
typedef enum aarch64_cond_code
{
AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
-@@ -1703,7 +1895,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
+@@ -1703,7 +1896,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
we need to expand the literal pool access carefully.
This is something that needs to be done in a number
of places, so could well live as a separate function. */
@@ -2464,30 +2968,35 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
{
gcc_assert (can_create_pseudo_p ());
base = gen_reg_rtx (ptr_mode);
-@@ -1766,6 +1958,61 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
+@@ -1766,6 +1959,88 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
}
-+/* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to held
-+ intermediate value if necessary.
++/* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
++ temporary value if necessary. FRAME_RELATED_P should be true if
++ the RTX_FRAME_RELATED flag should be set and CFA adjustments added
++ to the generated instructions. If SCRATCHREG is known to hold
++ abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
++ immediate again.
+
-+ This function is sometimes used to adjust the stack pointer, so we must
-+ ensure that it can never cause transient stack deallocation by writing an
-+ invalid value into REGNUM. */
++ Since this function may be used to adjust the stack pointer, we must
++ ensure that it cannot cause transient stack deallocation (for example
++ by first incrementing SP and then decrementing when adjusting by a
++ large immediate). */
+
+static void
-+aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
-+ HOST_WIDE_INT delta, bool frame_related_p)
++aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
++ HOST_WIDE_INT delta, bool frame_related_p,
++ bool emit_move_imm)
+{
+ HOST_WIDE_INT mdelta = abs_hwi (delta);
+ rtx this_rtx = gen_rtx_REG (mode, regnum);
+ rtx_insn *insn;
+
-+ /* Do nothing if mdelta is zero. */
+ if (!mdelta)
+ return;
+
-+ /* We only need single instruction if the offset fit into add/sub. */
++ /* Single instruction adjustment. */
+ if (aarch64_uimm12_shift (mdelta))
+ {
+ insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
@@ -2495,11 +3004,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return;
+ }
+
-+ /* We need two add/sub instructions, each one performing part of the
-+ calculation. Don't do this if the addend can be loaded into register with
-+ a single instruction, in that case we prefer a move to a scratch register
-+ following by an addition. */
-+ if (mdelta < 0x1000000 && !aarch64_move_imm (delta, mode))
++ /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
++ Only do this if mdelta is not a 16-bit move as adjusting using a move
++ is better. */
++ if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
+ {
+ HOST_WIDE_INT low_off = mdelta & 0xfff;
+
@@ -2511,10 +3019,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return;
+ }
+
-+ /* Otherwise use generic function to handle all other situations. */
++ /* Emit a move immediate if required and an addition/subtraction. */
+ rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
-+ aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (delta), true, mode);
-+ insn = emit_insn (gen_add2_insn (this_rtx, scratch_rtx));
++ if (emit_move_imm)
++ aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
++ insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
++ : gen_add2_insn (this_rtx, scratch_rtx));
+ if (frame_related_p)
+ {
+ RTX_FRAME_RELATED_P (insn) = frame_related_p;
@@ -2523,10 +3033,40 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ }
+}
+
++static inline void
++aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
++ HOST_WIDE_INT delta)
++{
++ aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
++}
++
++static inline void
++aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
++{
++ aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
++ true, emit_move_imm);
++}
++
++static inline void
++aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
++{
++ aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
++ frame_related_p, true);
++}
++
static bool
aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
tree exp ATTRIBUTE_UNUSED)
-@@ -2498,8 +2745,8 @@ aarch64_layout_frame (void)
+@@ -2490,7 +2765,7 @@ static void
+ aarch64_layout_frame (void)
+ {
+ HOST_WIDE_INT offset = 0;
+- int regno;
++ int regno, last_fp_reg = INVALID_REGNUM;
+
+ if (reload_completed && cfun->machine->frame.laid_out)
+ return;
+@@ -2498,8 +2773,8 @@ aarch64_layout_frame (void)
#define SLOT_NOT_REQUIRED (-2)
#define SLOT_REQUIRED (-1)
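(Illustration, not part of the patch: for adjustments below 2^24 that are
not 16-bit move immediates, aarch64_add_constant_internal above splits the
value so both halves fit an add/sub immediate (12 bits, optionally shifted
left by 12), and both steps move SP in the same direction, avoiding the
transient deallocation the comment warns about. A standalone re-creation of
the split:

    #include <inttypes.h>
    #include <stdio.h>

    int
    main (void)
    {
      int64_t delta = -0x12345;            /* e.g. a frame allocation */
      uint64_t mdelta = delta < 0 ? -delta : delta;
      uint64_t low = mdelta & 0xfff;       /* plain 12-bit immediate */
      uint64_t high = mdelta - low;        /* 12-bit immediate, LSL #12 */
      printf ("sub sp, sp, #0x%" PRIx64 "\n", high);
      printf ("sub sp, sp, #0x%" PRIx64 "\n", low);
      return 0;
    })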
@@ -2537,7 +3077,19 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* First mark all the registers that really need to be saved... */
for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
-@@ -2533,7 +2780,6 @@ aarch64_layout_frame (void)
+@@ -2524,7 +2799,10 @@ aarch64_layout_frame (void)
+ for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
+ if (df_regs_ever_live_p (regno)
+ && !call_used_regs[regno])
+- cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
++ {
++ cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
++ last_fp_reg = regno;
++ }
+
+ if (frame_pointer_needed)
+ {
+@@ -2533,7 +2811,6 @@ aarch64_layout_frame (void)
cfun->machine->frame.wb_candidate1 = R29_REGNUM;
cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
cfun->machine->frame.wb_candidate2 = R30_REGNUM;
@@ -2545,7 +3097,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
offset += 2 * UNITS_PER_WORD;
}
-@@ -2542,9 +2788,9 @@ aarch64_layout_frame (void)
+@@ -2542,35 +2819,46 @@ aarch64_layout_frame (void)
if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
{
cfun->machine->frame.reg_offset[regno] = offset;
@@ -2557,9 +3109,22 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
cfun->machine->frame.wb_candidate2 = regno;
offset += UNITS_PER_WORD;
}
-@@ -2553,24 +2799,23 @@ aarch64_layout_frame (void)
+
++ HOST_WIDE_INT max_int_offset = offset;
++ offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
++ bool has_align_gap = offset != max_int_offset;
++
+ for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
{
++ /* If there is an alignment gap between integer and fp callee-saves,
++ allocate the last fp register to it if possible. */
++ if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
++ {
++ cfun->machine->frame.reg_offset[regno] = max_int_offset;
++ break;
++ }
++
cfun->machine->frame.reg_offset[regno] = offset;
- if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
+ if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
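(Illustration, not part of the patch: the has_align_gap logic above reuses
the 8-byte hole left when the integer save area is rounded up to 16 bytes,
placing the last FP callee-save there instead of growing the frame. A worked
example, assuming x19-x21 plus d8 need saving:

    #include <stdio.h>

    #define ROUND_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

    int
    main (void)
    {
      int offset = 3 * 8;               /* x19, x20, x21 at 0, 8, 16 */
      int max_int_offset = offset;      /* 24 */
      offset = ROUND_UP (offset, 16);   /* 32; hole at bytes 24-31 */
      int has_align_gap = offset != max_int_offset;
      int d8_off = (has_align_gap && (offset & 8) == 0)
                   ? max_int_offset : offset;
      printf ("d8 saved at offset %d\n", d8_off);   /* 24 */
      return 0;
    })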
@@ -2588,7 +3153,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
STACK_BOUNDARY / BITS_PER_UNIT);
cfun->machine->frame.frame_size
-@@ -2578,6 +2823,77 @@ aarch64_layout_frame (void)
+@@ -2578,15 +2866,92 @@ aarch64_layout_frame (void)
+ crtl->outgoing_args_size,
STACK_BOUNDARY / BITS_PER_UNIT);
@@ -2666,9 +3231,49 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
cfun->machine->frame.laid_out = true;
}
-@@ -2631,10 +2947,14 @@ aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
++/* Return true if the register REGNO is saved on entry to
++ the current function. */
++
+ static bool
+ aarch64_register_saved_on_entry (int regno)
+ {
+ return cfun->machine->frame.reg_offset[regno] >= 0;
+ }
+
++/* Return the next register up from REGNO up to LIMIT for the callee
++ to save. */
++
+ static unsigned
+ aarch64_next_callee_save (unsigned regno, unsigned limit)
+ {
+@@ -2595,6 +2960,9 @@ aarch64_next_callee_save (unsigned regno, unsigned limit)
+ return regno;
+ }
+
++/* Push the register number REGNO of mode MODE to the stack with write-back
++ adjusting the stack by ADJUSTMENT. */
++
+ static void
+ aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
+ HOST_WIDE_INT adjustment)
+@@ -2611,6 +2979,10 @@ aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
+
++/* Generate and return an instruction to store the pair of registers
++ REG and REG2 of mode MODE to location BASE with write-back adjusting
++ the stack location BASE by ADJUSTMENT. */
++
+ static rtx
+ aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
+ HOST_WIDE_INT adjustment)
+@@ -2630,11 +3002,18 @@ aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
+ }
}
++/* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
++ stack pointer by ADJUSTMENT. */
++
static void
-aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
- unsigned regno2, HOST_WIDE_INT adjustment)
@@ -2683,10 +3288,24 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
rtx reg1 = gen_rtx_REG (mode, regno1);
rtx reg2 = gen_rtx_REG (mode, regno2);
-@@ -2662,6 +2982,30 @@ aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
+@@ -2645,6 +3024,9 @@ aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
+
++/* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
++ adjusting it by ADJUSTMENT afterwards. */
++
+ static rtx
+ aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
+ HOST_WIDE_INT adjustment)
+@@ -2662,6 +3044,37 @@ aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
}
}
++/* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
++ afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
++ into CFI_OPS. */
++
+static void
+aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
+ rtx *cfi_ops)
@@ -2711,10 +3330,45 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ }
+}
+
++/* Generate and return a store pair instruction of mode MODE to store
++ register REG1 to MEM1 and register REG2 to MEM2. */
++
static rtx
aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
rtx reg2)
-@@ -2848,23 +3192,16 @@ aarch64_restore_callee_saves (machine_mode mode,
+@@ -2679,6 +3092,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
+ }
+ }
+
++/* Generate and return a load pair instruction of mode MODE to load register
++ REG1 from MEM1 and register REG2 from MEM2. */
++
+ static rtx
+ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
+ rtx mem2)
+@@ -2696,6 +3112,9 @@ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
+ }
+ }
+
++/* Emit code to save the callee-saved registers from register number START
++ to LIMIT to the stack at the location starting at offset START_OFFSET,
++ skipping any write-back candidates if SKIP_WB is true. */
+
+ static void
+ aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
+@@ -2754,6 +3173,11 @@ aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
+ }
+ }
+
++/* Emit code to restore the callee registers of mode MODE from register
++ number START up to and including LIMIT. Restore from the stack offset
++ START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
++ Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
++
+ static void
+ aarch64_restore_callee_saves (machine_mode mode,
+ HOST_WIDE_INT start_offset, unsigned start,
+@@ -2848,23 +3272,16 @@ aarch64_restore_callee_saves (machine_mode mode,
void
aarch64_expand_prologue (void)
{
@@ -2746,7 +3400,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
if (flag_stack_usage_info)
current_function_static_stack_size = frame_size;
-@@ -2881,129 +3218,29 @@ aarch64_expand_prologue (void)
+@@ -2881,129 +3298,28 @@ aarch64_expand_prologue (void)
aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
}
@@ -2765,14 +3419,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-
- frame_size -= (offset + crtl->outgoing_args_size);
- fp_offset = 0;
-+ aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, -initial_adjust, true);
++ aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
- if (frame_size >= 0x1000000)
- {
- rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
- emit_move_insn (op0, GEN_INT (-frame_size));
- insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
--
++ if (callee_adjust != 0)
++ aarch64_push_regs (reg1, reg2, callee_adjust);
+
- add_reg_note (insn, REG_CFA_ADJUST_CFA,
- gen_rtx_SET (stack_pointer_rtx,
- plus_constant (Pmode, stack_pointer_rtx,
@@ -2783,9 +3439,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- {
- int hi_ofs = frame_size & 0xfff000;
- int lo_ofs = frame_size & 0x000fff;
-+ if (callee_adjust != 0)
-+ aarch64_push_regs (reg1, reg2, callee_adjust);
-
+-
- if (hi_ofs)
- {
- insn = emit_insn (gen_add2_insn
@@ -2889,12 +3543,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ callee_adjust != 0 || frame_pointer_needed);
+ aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+ callee_adjust != 0 || frame_pointer_needed);
-+ aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, -final_adjust,
-+ !frame_pointer_needed);
++ aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
}
/* Return TRUE if we can use a simple_return insn.
-@@ -3026,150 +3263,79 @@ aarch64_use_return_insn_p (void)
+@@ -3026,150 +3342,79 @@ aarch64_use_return_insn_p (void)
return cfun->machine->frame.frame_size == 0;
}
@@ -2980,7 +3633,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
}
+ else
-+ aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, final_adjust, true);
++ aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
- if (offset > 0)
- {
@@ -3073,8 +3726,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- {
- int hi_ofs = frame_size & 0xfff000;
- int lo_ofs = frame_size & 0x000fff;
-+ aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, initial_adjust, true);
-
+-
- if (hi_ofs && lo_ofs)
- {
- insn = emit_insn (gen_add2_insn
@@ -3085,7 +3737,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- insn = emit_insn (gen_add2_insn
- (stack_pointer_rtx, GEN_INT (frame_size)));
- }
--
++ aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
+
- /* Reset the CFA to be SP + 0. */
- add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
+ if (cfi_ops)
@@ -3097,7 +3750,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
RTX_FRAME_RELATED_P (insn) = 1;
}
-@@ -3237,122 +3403,6 @@ aarch64_final_eh_return_addr (void)
+@@ -3237,122 +3482,6 @@ aarch64_final_eh_return_addr (void)
- 2 * UNITS_PER_WORD));
}
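(Illustration, not part of the patch: after this rewrite both prologue and
epilogue are driven by the same three precomputed quantities: initial_adjust
(stack allocated before the callee saves), callee_adjust (folded into a
store/load pair with write-back) and final_adjust (the outgoing-argument
area). A hypothetical function whose frame is too large for a single add/sub
immediate, useful for inspecting the new code paths:

    void
    big_frame (void)
    {
      volatile char buf[0x12345];   /* too large for one sub immediate */
      buf[0] = 1;
    }

At -O2 the allocation should now come out of aarch64_sub_sp as a pair of
sub-immediate instructions rather than the old move-immediate-plus-add
sequence.)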
@@ -3220,25 +3873,25 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Output code to add DELTA to the first argument, and then jump
to FUNCTION. Used for C++ multiple inheritance. */
static void
-@@ -3373,7 +3423,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
+@@ -3373,7 +3502,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
emit_note (NOTE_INSN_PROLOGUE_END);
if (vcall_offset == 0)
- aarch64_add_constant (this_regno, IP1_REGNUM, delta);
-+ aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta, false);
++ aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
else
{
gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
-@@ -3389,7 +3439,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
+@@ -3389,7 +3518,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
plus_constant (Pmode, this_rtx, delta));
else
- aarch64_add_constant (this_regno, IP1_REGNUM, delta);
-+ aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta, false);
++ aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
}
if (Pmode == ptr_mode)
-@@ -3403,7 +3453,8 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
+@@ -3403,7 +3532,8 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
addr = plus_constant (Pmode, temp0, vcall_offset);
else
{
@@ -3248,7 +3901,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
addr = gen_rtx_PLUS (Pmode, temp0, temp1);
}
-@@ -3582,7 +3633,12 @@ aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
+@@ -3582,7 +3712,12 @@ aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
return aarch64_tls_referenced_p (x);
}
@@ -3262,7 +3915,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
static unsigned int
aarch64_case_values_threshold (void)
-@@ -3593,7 +3649,7 @@ aarch64_case_values_threshold (void)
+@@ -3593,7 +3728,7 @@ aarch64_case_values_threshold (void)
&& selected_cpu->tune->max_case_values != 0)
return selected_cpu->tune->max_case_values;
else
@@ -3271,7 +3924,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
/* Return true if register REGNO is a valid index register.
-@@ -3928,9 +3984,11 @@ aarch64_classify_address (struct aarch64_address_info *info,
+@@ -3928,9 +4063,11 @@ aarch64_classify_address (struct aarch64_address_info *info,
X,X: 7-bit signed scaled offset
Q: 9-bit signed offset
We conservatively require an offset representable in either mode.
@@ -3285,7 +3938,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
&& offset_9bit_signed_unscaled_p (mode, offset));
/* A 7bit offset check because OImode will emit a ldp/stp
-@@ -4038,7 +4096,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
+@@ -4038,7 +4175,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
return ((GET_CODE (sym) == LABEL_REF
|| (GET_CODE (sym) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (sym)
@@ -3294,7 +3947,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
return false;
-@@ -4132,6 +4190,24 @@ aarch64_legitimate_address_p (machine_mode mode, rtx x,
+@@ -4132,6 +4269,24 @@ aarch64_legitimate_address_p (machine_mode mode, rtx x,
return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
}
@@ -3319,7 +3972,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Return TRUE if rtx X is immediate constant 0.0 */
bool
aarch64_float_const_zero_rtx_p (rtx x)
-@@ -4205,6 +4281,14 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
+@@ -4205,6 +4360,14 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
&& (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
return CC_NZmode;
@@ -3334,7 +3987,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
&& y == const0_rtx
&& (code == EQ || code == NE || code == LT || code == GE)
-@@ -4232,14 +4316,6 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
+@@ -4232,14 +4395,6 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
&& GET_CODE (x) == NEG)
return CC_Zmode;
@@ -3349,7 +4002,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* A test for unsigned overflow. */
if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
&& code == NE
-@@ -4308,8 +4384,6 @@ aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
+@@ -4308,8 +4463,6 @@ aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
break;
case CC_SWPmode:
@@ -3358,7 +4011,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
switch (comp_code)
{
case NE: return AARCH64_NE;
-@@ -4964,7 +5038,7 @@ aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
+@@ -4964,7 +5117,7 @@ aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
{
rtx base = XEXP (x, 0);
@@ -3367,7 +4020,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
HOST_WIDE_INT offset = INTVAL (offset_rtx);
if (GET_CODE (base) == PLUS)
-@@ -5022,120 +5096,6 @@ aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
+@@ -5022,120 +5175,6 @@ aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
return x;
}
@@ -3488,7 +4141,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Return the reload icode required for a constant pool in mode. */
static enum insn_code
aarch64_constant_pool_reload_icode (machine_mode mode)
-@@ -5193,7 +5153,7 @@ aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
+@@ -5193,7 +5232,7 @@ aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
&& (SCALAR_FLOAT_MODE_P (GET_MODE (x))
|| targetm.vector_mode_supported_p (GET_MODE (x)))
@@ -3497,7 +4150,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
{
sri->icode = aarch64_constant_pool_reload_icode (mode);
return NO_REGS;
-@@ -5267,18 +5227,18 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to)
+@@ -5267,18 +5306,18 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to)
if (to == HARD_FRAME_POINTER_REGNUM)
{
if (from == ARG_POINTER_REGNUM)
@@ -3521,7 +4174,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
return cfun->machine->frame.frame_size;
-@@ -5527,7 +5487,7 @@ aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
+@@ -5527,7 +5566,7 @@ aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
static inline bool
aarch64_can_use_per_function_literal_pools_p (void)
{
@@ -3530,7 +4183,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
|| aarch64_cmodel == AARCH64_CMODEL_LARGE);
}
-@@ -6146,6 +6106,19 @@ aarch64_extend_bitfield_pattern_p (rtx x)
+@@ -6146,6 +6185,19 @@ aarch64_extend_bitfield_pattern_p (rtx x)
return op;
}
@@ -3550,7 +4203,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Calculate the cost of calculating X, storing it in *COST. Result
is true if the total cost of the operation has now been calculated. */
static bool
-@@ -6411,10 +6384,6 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
+@@ -6411,10 +6463,6 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
/* TODO: A write to the CC flags possibly costs extra, this
needs encoding in the cost tables. */
@@ -3561,7 +4214,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
mode = GET_MODE (op0);
/* ANDS. */
if (GET_CODE (op0) == AND)
-@@ -6724,17 +6693,31 @@ cost_plus:
+@@ -6724,17 +6772,31 @@ cost_plus:
if (GET_MODE_CLASS (mode) == MODE_INT)
{
@@ -3601,7 +4254,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
else
{
-@@ -6838,11 +6821,12 @@ cost_plus:
+@@ -6838,11 +6900,12 @@ cost_plus:
{
int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
@@ -3619,7 +4272,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
*cost = op_cost;
return true;
-@@ -6872,8 +6856,8 @@ cost_plus:
+@@ -6872,8 +6935,8 @@ cost_plus:
}
else
{
@@ -3630,7 +4283,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
}
return false;
-@@ -7452,12 +7436,12 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
+@@ -7452,12 +7515,12 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
to optimize 1.0/sqrt. */
static bool
@@ -3646,7 +4299,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
|| flag_mrecip_low_precision_sqrt));
}
-@@ -7467,89 +7451,225 @@ use_rsqrt_p (void)
+@@ -7467,89 +7530,225 @@ use_rsqrt_p (void)
static tree
aarch64_builtin_reciprocal (tree fndecl)
{
@@ -3728,14 +4381,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- mode == SFmode || mode == V2SFmode || mode == V4SFmode
- || mode == DFmode || mode == V2DFmode);
+ machine_mode mode = GET_MODE (dst);
++
++ if (GET_MODE_INNER (mode) == HFmode)
++ return false;
- rtx xsrc = gen_reg_rtx (mode);
- emit_move_insn (xsrc, src);
- rtx x0 = gen_reg_rtx (mode);
-+ if (GET_MODE_INNER (mode) == HFmode)
-+ return false;
-
-- emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
+ machine_mode mmsk = mode_for_vector
+ (int_mode_for_mode (GET_MODE_INNER (mode)),
+ GET_MODE_NUNITS (mode));
@@ -3755,7 +4407,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ || optimize_function_for_size_p (cfun))
+ return false;
-- bool double_mode = (mode == DFmode || mode == V2DFmode);
+- emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
+ rtx xmsk = gen_reg_rtx (mmsk);
+ if (!recp)
+ /* When calculating the approximate square root, compare the argument with
@@ -3763,16 +4415,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
+ CONST0_RTX (mode)))));
-- int iterations = double_mode ? 3 : 2;
+- bool double_mode = (mode == DFmode || mode == V2DFmode);
+ /* Estimate the approximate reciprocal square root. */
+ rtx xdst = gen_reg_rtx (mode);
+ emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
-- /* Optionally iterate over the series one less time than otherwise. */
-- if (flag_mrecip_low_precision_sqrt)
+- int iterations = double_mode ? 3 : 2;
+ /* Iterate over the series twice for SF and thrice for DF. */
+ int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
-+
+
+- /* Optionally iterate over the series one less time than otherwise. */
+- if (flag_mrecip_low_precision_sqrt)
+ /* Optionally iterate over the series once less for faster performance
+ while sacrificing the accuracy. */
+ if ((recp && flag_mrecip_low_precision_sqrt)
@@ -3833,9 +4486,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ default: gcc_unreachable ();
+ }
+}
-+
+
+- emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
+typedef rtx (*recps_type) (rtx, rtx, rtx);
-+
+
+- emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
+- x0 = x1;
+/* Select reciprocal series step insn depending on machine mode. */
+
+static recps_type
@@ -3877,13 +4533,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ /* Estimate the approximate reciprocal. */
+ rtx xrcp = gen_reg_rtx (mode);
+ emit_insn ((*get_recpe_type (mode)) (xrcp, den));
-
-- emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
++
+ /* Iterate over the series twice for SF and thrice for DF. */
+ int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
-
-- emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
-- x0 = x1;
++
+ /* Optionally iterate over the series once less for faster performance,
+ while sacrificing the accuracy. */
+ if (flag_mlow_precision_div)
@@ -3897,24 +4550,24 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+
+ if (iterations > 0)
+ emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
- }
-
-- emit_move_insn (dst, x0);
++ }
++
+ if (num != CONST1_RTX (mode))
+ {
+ /* As the approximate reciprocal of DEN is already calculated, only
+ calculate the approximate division when NUM is not 1.0. */
+ rtx xnum = force_reg (mode, num);
+ emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
-+ }
-+
+ }
+
+- emit_move_insn (dst, x0);
+ /* Finalize the approximation. */
+ emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
+ return true;
}
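(Illustration, not part of the patch: both approximation helpers share one
shape: a hardware estimate (frecpe/frsqrte) refined by Newton-Raphson steps
(frecps/frsqrts), twice for single precision and three times for double,
with one step optionally dropped by the low-precision flags. A plain-C model
of the rsqrt iteration:

    #include <stdio.h>

    /* One FRSQRTS-style step: e' = e * (3 - x*e*e) / 2.  */
    static double
    rsqrt_step (double x, double e)
    {
      return e * (1.5 - 0.5 * x * e * e);
    }

    int
    main (void)
    {
      double x = 2.0, e = 0.7;       /* crude initial estimate */
      for (int i = 0; i < 3; i++)    /* three steps for DFmode */
        e = rsqrt_step (x, e);
      printf ("%.17g\n", e);         /* ~0.70710678... = 1/sqrt(2) */
      return 0;
    })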
/* Return the number of instructions that can be issued per cycle. */
-@@ -8053,32 +8173,37 @@ aarch64_override_options_after_change_1 (struct gcc_options *opts)
+@@ -8053,32 +8252,37 @@ aarch64_override_options_after_change_1 (struct gcc_options *opts)
opts->x_align_functions = aarch64_tune_params.function_align;
}
@@ -3969,25 +4622,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
/* 'Unpack' up the internal tuning structs and update the options
-@@ -9280,33 +9405,24 @@ aarch64_classify_symbol (rtx x, rtx offset)
-
- if (GET_CODE (x) == SYMBOL_REF)
- {
-- if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
-- {
-- /* This is alright even in PIC code as the constant
-- pool reference is always PC relative and within
-- the same translation unit. */
-- if (nopcrelative_literal_loads
-- && CONSTANT_POOL_ADDRESS_P (x))
-- return SYMBOL_SMALL_ABSOLUTE;
-- else
-- return SYMBOL_FORCE_TO_MEM;
-- }
--
- if (aarch64_tls_symbol_p (x))
- return aarch64_classify_tls_symbol (x);
-
+@@ -9286,15 +9490,18 @@ aarch64_classify_symbol (rtx x, rtx offset)
switch (aarch64_cmodel)
{
case AARCH64_CMODEL_TINY:
@@ -4009,7 +4644,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
|| INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
return SYMBOL_FORCE_TO_MEM;
return SYMBOL_TINY_ABSOLUTE;
-@@ -9314,7 +9430,8 @@ aarch64_classify_symbol (rtx x, rtx offset)
+@@ -9302,7 +9509,8 @@ aarch64_classify_symbol (rtx x, rtx offset)
case AARCH64_CMODEL_SMALL:
/* Same reasoning as the tiny code model, but the offset cap here is
4G. */
@@ -4019,23 +4654,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
|| !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
HOST_WIDE_INT_C (4294967264)))
return SYMBOL_FORCE_TO_MEM;
-@@ -9332,6 +9449,15 @@ aarch64_classify_symbol (rtx x, rtx offset)
- ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
- return SYMBOL_SMALL_ABSOLUTE;
-
-+ case AARCH64_CMODEL_LARGE:
-+ /* This is alright even in PIC code as the constant
-+ pool reference is always PC relative and within
-+ the same translation unit. */
+@@ -9324,8 +9532,7 @@ aarch64_classify_symbol (rtx x, rtx offset)
+ /* This is alright even in PIC code as the constant
+ pool reference is always PC relative and within
+ the same translation unit. */
+- if (nopcrelative_literal_loads
+- && CONSTANT_POOL_ADDRESS_P (x))
+ if (CONSTANT_POOL_ADDRESS_P (x))
-+ return SYMBOL_SMALL_ABSOLUTE;
-+ else
-+ return SYMBOL_FORCE_TO_MEM;
-+
- default:
- gcc_unreachable ();
- }
-@@ -9463,6 +9589,13 @@ aarch64_build_builtin_va_list (void)
+ return SYMBOL_SMALL_ABSOLUTE;
+ else
+ return SYMBOL_FORCE_TO_MEM;
+@@ -9461,6 +9668,13 @@ aarch64_build_builtin_va_list (void)
FIELD_DECL, get_identifier ("__vr_offs"),
integer_type_node);
@@ -4049,7 +4678,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
DECL_ARTIFICIAL (f_stack) = 1;
DECL_ARTIFICIAL (f_grtop) = 1;
DECL_ARTIFICIAL (f_vrtop) = 1;
-@@ -9495,15 +9628,17 @@ aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
+@@ -9493,15 +9707,17 @@ aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
tree stack, grtop, vrtop, groff, vroff;
tree t;
@@ -4073,7 +4702,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
if (!TARGET_FLOAT)
{
-@@ -9832,7 +9967,8 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
+@@ -9830,7 +10046,8 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
{
CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
CUMULATIVE_ARGS local_cum;
@@ -4083,7 +4712,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* The caller has advanced CUM up to, but not beyond, the last named
argument. Advance a local copy of CUM past the last "real" named
-@@ -9840,9 +9976,14 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
+@@ -9838,9 +10055,14 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
local_cum = *cum;
aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
@@ -4101,7 +4730,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
if (!TARGET_FLOAT)
{
-@@ -9870,7 +10011,7 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
+@@ -9868,7 +10090,7 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
/* We can't use move_block_from_reg, because it will use
the wrong mode, storing D regs only. */
machine_mode mode = TImode;
@@ -4110,7 +4739,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Set OFF to the offset from virtual_incoming_args_rtx of
the first vector register. The VR save area lies below
-@@ -9879,14 +10020,15 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
+@@ -9877,14 +10099,15 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
STACK_BOUNDARY / BITS_PER_UNIT);
off -= vr_saved * UNITS_PER_VREG;
@@ -4128,7 +4757,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
off += UNITS_PER_VREG;
}
}
-@@ -10848,33 +10990,6 @@ aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
+@@ -10846,33 +11069,6 @@ aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
gen_rtx_REG (mode, rsrc + count - i - 1));
}
@@ -4162,7 +4791,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
one of VSTRUCT modes: OI, CI, or XI. */
int
-@@ -11956,12 +12071,11 @@ aarch64_output_simd_mov_immediate (rtx const_vector,
+@@ -11954,12 +12150,11 @@ aarch64_output_simd_mov_immediate (rtx const_vector,
info.value = GEN_INT (0);
else
{
@@ -4176,7 +4805,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
if (lane_count == 1)
snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
-@@ -12195,6 +12309,8 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d)
+@@ -12193,6 +12388,8 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d)
case V4SImode: gen = gen_aarch64_trn2v4si; break;
case V2SImode: gen = gen_aarch64_trn2v2si; break;
case V2DImode: gen = gen_aarch64_trn2v2di; break;
@@ -4185,7 +4814,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
case V2DFmode: gen = gen_aarch64_trn2v2df; break;
-@@ -12213,6 +12329,8 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d)
+@@ -12211,6 +12408,8 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d)
case V4SImode: gen = gen_aarch64_trn1v4si; break;
case V2SImode: gen = gen_aarch64_trn1v2si; break;
case V2DImode: gen = gen_aarch64_trn1v2di; break;
@@ -4194,7 +4823,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
case V2DFmode: gen = gen_aarch64_trn1v2df; break;
-@@ -12278,6 +12396,8 @@ aarch64_evpc_uzp (struct expand_vec_perm_d *d)
+@@ -12276,6 +12475,8 @@ aarch64_evpc_uzp (struct expand_vec_perm_d *d)
case V4SImode: gen = gen_aarch64_uzp2v4si; break;
case V2SImode: gen = gen_aarch64_uzp2v2si; break;
case V2DImode: gen = gen_aarch64_uzp2v2di; break;
@@ -4203,7 +4832,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
-@@ -12296,6 +12416,8 @@ aarch64_evpc_uzp (struct expand_vec_perm_d *d)
+@@ -12294,6 +12495,8 @@ aarch64_evpc_uzp (struct expand_vec_perm_d *d)
case V4SImode: gen = gen_aarch64_uzp1v4si; break;
case V2SImode: gen = gen_aarch64_uzp1v2si; break;
case V2DImode: gen = gen_aarch64_uzp1v2di; break;
@@ -4212,7 +4841,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
-@@ -12366,6 +12488,8 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d)
+@@ -12364,6 +12567,8 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d)
case V4SImode: gen = gen_aarch64_zip2v4si; break;
case V2SImode: gen = gen_aarch64_zip2v2si; break;
case V2DImode: gen = gen_aarch64_zip2v2di; break;
@@ -4221,7 +4850,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
case V2DFmode: gen = gen_aarch64_zip2v2df; break;
-@@ -12384,6 +12508,8 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d)
+@@ -12382,6 +12587,8 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d)
case V4SImode: gen = gen_aarch64_zip1v4si; break;
case V2SImode: gen = gen_aarch64_zip1v2si; break;
case V2DImode: gen = gen_aarch64_zip1v2di; break;
@@ -4230,7 +4859,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
case V2DFmode: gen = gen_aarch64_zip1v2df; break;
-@@ -12428,6 +12554,8 @@ aarch64_evpc_ext (struct expand_vec_perm_d *d)
+@@ -12426,6 +12633,8 @@ aarch64_evpc_ext (struct expand_vec_perm_d *d)
case V8HImode: gen = gen_aarch64_extv8hi; break;
case V2SImode: gen = gen_aarch64_extv2si; break;
case V4SImode: gen = gen_aarch64_extv4si; break;
@@ -4239,7 +4868,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
case V2SFmode: gen = gen_aarch64_extv2sf; break;
case V4SFmode: gen = gen_aarch64_extv4sf; break;
case V2DImode: gen = gen_aarch64_extv2di; break;
-@@ -12503,6 +12631,8 @@ aarch64_evpc_rev (struct expand_vec_perm_d *d)
+@@ -12501,6 +12710,8 @@ aarch64_evpc_rev (struct expand_vec_perm_d *d)
case V2SImode: gen = gen_aarch64_rev64v2si; break;
case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
@@ -4248,7 +4877,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
default:
return false;
}
-@@ -12746,24 +12876,6 @@ aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
+@@ -12744,24 +12955,6 @@ aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
return ret;
}
@@ -4273,7 +4902,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
rtx
aarch64_reverse_mask (enum machine_mode mode)
{
-@@ -12785,7 +12897,14 @@ aarch64_reverse_mask (enum machine_mode mode)
+@@ -12783,7 +12976,14 @@ aarch64_reverse_mask (enum machine_mode mode)
return force_reg (V16QImode, mask);
}
@@ -4289,7 +4918,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
bool
aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
-@@ -12796,9 +12915,12 @@ aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
+@@ -12794,9 +12994,12 @@ aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
/* We specifically want to allow elements of "structure" modes to
be tieable to the structure. This more general condition allows
other rarer situations too. */
@@ -4305,7 +4934,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return true;
return false;
-@@ -13314,6 +13436,14 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
+@@ -13312,6 +13515,14 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
return false;
}
@@ -4320,7 +4949,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* If MEM is in the form of [base+offset], extract the two parts
of address and set to BASE and OFFSET, otherwise return false
after clearing BASE and OFFSET. */
-@@ -13492,6 +13622,15 @@ aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
+@@ -13490,6 +13701,15 @@ aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
return false;
@@ -4336,7 +4965,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Check if the addresses are in the form of [base+offset]. */
extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
-@@ -13651,6 +13790,15 @@ aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
+@@ -13649,6 +13869,15 @@ aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
return false;
}
@@ -4352,7 +4981,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
rclass_1 = FP_REGS;
else
-@@ -13886,13 +14034,13 @@ aarch64_promoted_type (const_tree t)
+@@ -13884,13 +14113,13 @@ aarch64_promoted_type (const_tree t)
/* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
static bool
@@ -4368,7 +4997,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
default:
return true;
-@@ -14026,6 +14174,10 @@ aarch64_optab_supported_p (int op, machine_mode, machine_mode,
+@@ -14024,6 +14253,10 @@ aarch64_optab_supported_p (int op, machine_mode, machine_mode,
#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
@@ -4379,7 +5008,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
-@@ -14229,6 +14381,9 @@ aarch64_optab_supported_p (int op, machine_mode, machine_mode,
+@@ -14227,6 +14460,9 @@ aarch64_optab_supported_p (int op, machine_mode, machine_mode,
#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
@@ -4566,7 +5195,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
UNSPEC_USHL_2S
UNSPEC_VSTRUCTDUMMY
UNSPEC_SP_SET
-@@ -855,13 +859,6 @@
+@@ -856,13 +860,6 @@
|| aarch64_is_noplt_call_p (callee)))
XEXP (operands[0], 0) = force_reg (Pmode, callee);
@@ -4580,7 +5209,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
if (operands[2] == NULL_RTX)
operands[2] = const0_rtx;
-@@ -893,14 +890,6 @@
+@@ -894,14 +891,6 @@
|| aarch64_is_noplt_call_p (callee)))
XEXP (operands[1], 0) = force_reg (Pmode, callee);
@@ -4595,7 +5224,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
if (operands[3] == NULL_RTX)
operands[3] = const0_rtx;
-@@ -1178,11 +1167,12 @@
+@@ -1179,11 +1168,12 @@
)
(define_insn "*movhf_aarch64"
@@ -4610,7 +5239,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
mov\\t%0.h[0], %w1
umov\\t%w0, %1.h[0]
mov\\t%0.h[0], %1.h[0]
-@@ -1191,18 +1181,18 @@
+@@ -1192,18 +1182,18 @@
ldrh\\t%w0, %1
strh\\t%w1, %0
mov\\t%w0, %w1"
@@ -4634,7 +5263,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
fmov\\t%s0, %w1
fmov\\t%w0, %s1
fmov\\t%s0, %s1
-@@ -1212,16 +1202,18 @@
+@@ -1213,16 +1203,18 @@
ldr\\t%w0, %1
str\\t%w1, %0
mov\\t%w0, %w1"
@@ -4657,7 +5286,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
fmov\\t%d0, %x1
fmov\\t%x0, %d1
fmov\\t%d0, %d1
-@@ -1231,8 +1223,9 @@
+@@ -1232,8 +1224,9 @@
ldr\\t%x0, %1
str\\t%x1, %0
mov\\t%x0, %x1"
@@ -4669,7 +5298,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
)
(define_insn "*movtf_aarch64"
-@@ -1257,7 +1250,6 @@
+@@ -1258,7 +1251,6 @@
[(set_attr "type" "logic_reg,multiple,f_mcr,f_mrc,neon_move_q,f_mcr,\
f_loadd,f_stored,load2,store2,store2")
(set_attr "length" "4,8,8,8,4,4,4,4,4,4,4")
@@ -4677,7 +5306,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(set_attr "simd" "yes,*,*,*,yes,*,*,*,*,*,*")]
)
-@@ -1570,10 +1562,10 @@
+@@ -1571,10 +1563,10 @@
(zero_extend:GPI (match_operand:SHORT 1 "nonimmediate_operand" "r,m,m")))]
""
"@
@@ -4690,7 +5319,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
)
(define_expand "<optab>qihi2"
-@@ -1582,16 +1574,26 @@
+@@ -1583,16 +1575,26 @@
""
)
@@ -4721,7 +5350,39 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
;; -------------------------------------------------------------------
;; Simple arithmetic
;; -------------------------------------------------------------------
-@@ -1783,7 +1785,7 @@
+@@ -1604,25 +1606,12 @@
+ (match_operand:GPI 2 "aarch64_pluslong_operand" "")))]
+ ""
+ {
+- if (aarch64_pluslong_strict_immedate (operands[2], <MODE>mode))
+- {
+- /* Give CSE the opportunity to share this constant across additions. */
+- if (!cse_not_expected && can_create_pseudo_p ())
+- operands[2] = force_reg (<MODE>mode, operands[2]);
+-
+- /* Split will refuse to operate on a modification to the stack pointer.
+- Aid the prologue and epilogue expanders by splitting this now. */
+- else if (reload_completed && operands[0] == stack_pointer_rtx)
+- {
+- HOST_WIDE_INT i = INTVAL (operands[2]);
+- HOST_WIDE_INT s = (i >= 0 ? i & 0xfff : -(-i & 0xfff));
+- emit_insn (gen_rtx_SET (operands[0],
+- gen_rtx_PLUS (<MODE>mode, operands[1],
+- GEN_INT (i - s))));
+- operands[1] = operands[0];
+- operands[2] = GEN_INT (s);
+- }
+- }
++ /* If the constant is too large for a single instruction and isn't frame
++ based, split off the immediate so it is available for CSE. */
++ if (!aarch64_plus_immediate (operands[2], <MODE>mode)
++ && can_create_pseudo_p ()
++ && !REGNO_PTR_FRAME_P (REGNO (operands[1])))
++ operands[2] = force_reg (<MODE>mode, operands[2]);
+ })
+
+ (define_insn "*add<mode>3_aarch64"
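The rewritten add<mode>3 expander above drops the old stack-pointer special case in favour of one rule: a plus-immediate that no single instruction can encode, and that is not frame-based, is forced into a register so CSE can share it. A minimal C sketch of the effect (illustrative only; the exact assembly depends on the compiler build and options):

    /* 0x123456 does not fit an AArch64 add immediate, so the expander
       above moves it into a register first; the constant used twice
       can then be CSEd into a single mov.  */
    long f (long a, long b)
    {
      return (a + 0x123456) ^ (b + 0x123456);
    }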
+@@ -1784,7 +1773,7 @@
"aarch64_zero_extend_const_eq (<DWI>mode, operands[2],
<MODE>mode, operands[1])"
"@
@@ -4730,7 +5391,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
cmp\\t%<w>0, #%n1"
[(set_attr "type" "alus_imm")]
)
-@@ -1815,11 +1817,11 @@
+@@ -1816,11 +1805,11 @@
"aarch64_zero_extend_const_eq (<DWI>mode, operands[3],
<MODE>mode, operands[2])"
"@
@@ -4744,7 +5405,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "add<mode>3_compareC"
[(set (reg:CC_C CC_REGNUM)
(ne:CC_C
-@@ -3422,7 +3424,9 @@
+@@ -3423,7 +3412,9 @@
(LOGICAL:SI (match_operand:SI 1 "register_operand" "%r,r")
(match_operand:SI 2 "aarch64_logical_operand" "r,K"))))]
""
@@ -4755,7 +5416,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
[(set_attr "type" "logic_reg,logic_imm")]
)
-@@ -3435,7 +3439,9 @@
+@@ -3436,7 +3427,9 @@
(set (match_operand:GPI 0 "register_operand" "=r,r")
(and:GPI (match_dup 1) (match_dup 2)))]
""
@@ -4766,7 +5427,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
[(set_attr "type" "logics_reg,logics_imm")]
)
-@@ -3449,7 +3455,9 @@
+@@ -3450,7 +3443,9 @@
(set (match_operand:DI 0 "register_operand" "=r,r")
(zero_extend:DI (and:SI (match_dup 1) (match_dup 2))))]
""
@@ -4777,7 +5438,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
[(set_attr "type" "logics_reg,logics_imm")]
)
-@@ -3775,16 +3783,23 @@
+@@ -3776,16 +3771,23 @@
[(set_attr "type" "rbit")]
)
@@ -4810,7 +5471,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "*and<mode>_compare0"
[(set (reg:CC_NZ CC_REGNUM)
-@@ -3796,6 +3811,18 @@
+@@ -3797,6 +3799,18 @@
[(set_attr "type" "alus_imm")]
)
@@ -4829,7 +5490,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "*and<mode>3nr_compare0"
[(set (reg:CC_NZ CC_REGNUM)
(compare:CC_NZ
-@@ -3803,7 +3830,9 @@
+@@ -3804,7 +3818,9 @@
(match_operand:GPI 1 "aarch64_logical_operand" "r,<lconst>"))
(const_int 0)))]
""
@@ -4840,7 +5501,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
[(set_attr "type" "logics_reg,logics_imm")]
)
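The logical-operation hunks above rework the output templates of the and/compare patterns; the practical payoff is that an AND whose only consumer is a comparison can use the flag-setting forms. A hedged C sketch:

    /* Sketch: an AND used only for its comparison result typically
       folds to a single tst (an ands against the zero register).  */
    int any_low_bits (unsigned long x)
    {
      return (x & 0xff) != 0;
    }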
-@@ -3869,22 +3898,16 @@
+@@ -3870,22 +3886,16 @@
(define_expand "ashl<mode>3"
[(set (match_operand:SHORT 0 "register_operand")
(ashift:SHORT (match_operand:SHORT 1 "register_operand")
@@ -4869,7 +5530,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
)
-@@ -3933,33 +3956,35 @@
+@@ -3934,33 +3944,35 @@
;; Logical left shift using SISD or Integer instruction
(define_insn "*aarch64_ashl_sisd_or_int_<mode>3"
@@ -4917,7 +5578,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
)
(define_split
-@@ -3994,18 +4019,19 @@
+@@ -3995,18 +4007,19 @@
;; Arithmetic right shift using SISD or Integer instruction
(define_insn "*aarch64_ashr_sisd_or_int_<mode>3"
@@ -4942,7 +5603,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
)
(define_split
-@@ -4097,21 +4123,25 @@
+@@ -4098,21 +4111,25 @@
[(set (match_operand:GPI 0 "register_operand" "=r,r")
(rotatert:GPI
(match_operand:GPI 1 "register_operand" "r,r")
@@ -4976,7 +5637,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
)
(define_insn "*<optab><mode>3_insn"
-@@ -4135,7 +4165,7 @@
+@@ -4136,7 +4153,7 @@
"UINTVAL (operands[3]) < GET_MODE_BITSIZE (<MODE>mode) &&
(UINTVAL (operands[3]) + UINTVAL (operands[4]) == GET_MODE_BITSIZE (<MODE>mode))"
"extr\\t%<w>0, %<w>1, %<w>2, %4"
@@ -4985,7 +5646,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
)
;; There are no canonicalisation rules for ashift and lshiftrt inside an ior
-@@ -4150,7 +4180,7 @@
+@@ -4151,7 +4168,7 @@
&& (UINTVAL (operands[3]) + UINTVAL (operands[4])
== GET_MODE_BITSIZE (<MODE>mode))"
"extr\\t%<w>0, %<w>1, %<w>2, %4"
@@ -4994,7 +5655,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
)
;; zero_extend version of the above
-@@ -4164,7 +4194,7 @@
+@@ -4165,7 +4182,7 @@
"UINTVAL (operands[3]) < 32 &&
(UINTVAL (operands[3]) + UINTVAL (operands[4]) == 32)"
"extr\\t%w0, %w1, %w2, %4"
@@ -5003,7 +5664,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
)
(define_insn "*extrsi5_insn_uxtw_alt"
-@@ -4177,7 +4207,7 @@
+@@ -4178,7 +4195,7 @@
"UINTVAL (operands[3]) < 32 &&
(UINTVAL (operands[3]) + UINTVAL (operands[4]) == 32)"
"extr\\t%w0, %w1, %w2, %4"
@@ -5012,7 +5673,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
)
(define_insn "*ror<mode>3_insn"
-@@ -4357,9 +4387,7 @@
+@@ -4367,9 +4384,7 @@
(and:GPI (ashift:GPI (match_operand:GPI 1 "register_operand" "r")
(match_operand 2 "const_int_operand" "n"))
(match_operand 3 "const_int_operand" "n")))]
@@ -5023,7 +5684,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
"ubfiz\\t%<w>0, %<w>1, %2, %P3"
[(set_attr "type" "bfm")]
)
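The tightened condition above guards the shift-and-mask form that assembles to ubfiz, the zero-extending bit-field insert. A sketch of source the pattern is meant to cover:

    /* Sketch: mask-then-shift is a contiguous bit-field insert and
       typically assembles to one ubfiz.  */
    unsigned long scale_index (unsigned long i)
    {
      return (i & 0xf) << 3;
    }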
-@@ -4429,22 +4457,23 @@
+@@ -4439,22 +4454,23 @@
;; Expands to btrunc, ceil, floor, nearbyint, rint, round, frintn.
(define_insn "<frint_pattern><mode>2"
@@ -5054,7 +5715,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
[(set_attr "type" "f_cvtf2i")]
)
-@@ -4470,23 +4499,24 @@
+@@ -4480,23 +4496,24 @@
;; fma - no throw
(define_insn "fma<mode>4"
@@ -5089,30 +5750,30 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
)
(define_insn "fms<mode>4"
-@@ -4572,19 +4602,11 @@
+@@ -4582,19 +4599,11 @@
[(set_attr "type" "f_cvt")]
)
-(define_insn "fix_trunc<GPF:mode><GPI:mode>2"
-+(define_insn "<optab>_trunc<GPF_F16:mode><GPI:mode>2"
- [(set (match_operand:GPI 0 "register_operand" "=r")
+- [(set (match_operand:GPI 0 "register_operand" "=r")
- (fix:GPI (match_operand:GPF 1 "register_operand" "w")))]
-+ (FIXUORS:GPI (match_operand:GPF_F16 1 "register_operand" "w")))]
- "TARGET_FLOAT"
+- "TARGET_FLOAT"
- "fcvtzs\\t%<GPI:w>0, %<GPF:s>1"
- [(set_attr "type" "f_cvtf2i")]
-)
-
-(define_insn "fixuns_trunc<GPF:mode><GPI:mode>2"
-- [(set (match_operand:GPI 0 "register_operand" "=r")
++(define_insn "<optab>_trunc<GPF_F16:mode><GPI:mode>2"
+ [(set (match_operand:GPI 0 "register_operand" "=r")
- (unsigned_fix:GPI (match_operand:GPF 1 "register_operand" "w")))]
-- "TARGET_FLOAT"
++ (FIXUORS:GPI (match_operand:GPF_F16 1 "register_operand" "w")))]
+ "TARGET_FLOAT"
- "fcvtzu\\t%<GPI:w>0, %<GPF:s>1"
+ "fcvtz<su>\t%<GPI:w>0, %<GPF_F16:s>1"
[(set_attr "type" "f_cvtf2i")]
)
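The rewrite above merges the signed and unsigned truncating conversions into a single pattern driven by the FIXUORS iterator, and widens the source mode to GPF_F16 so __fp16 inputs are covered too. Both C casts below are instances of it (sketch; emitted as fcvtzs and fcvtzu respectively):

    int      to_s (float f) { return (int) f; }
    unsigned to_u (float f) { return (unsigned) f; }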
-@@ -4608,38 +4630,116 @@
+@@ -4618,38 +4627,116 @@
[(set_attr "type" "f_cvti2f")]
)
@@ -5244,7 +5905,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
)
(define_insn "*fnmul<mode>3"
-@@ -4662,38 +4762,58 @@
+@@ -4672,38 +4759,58 @@
[(set_attr "type" "fmul<s>")]
)
@@ -5319,7 +5980,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
)
;; Given that smax/smin do not specify the result when either input is NaN,
-@@ -4718,15 +4838,17 @@
+@@ -4728,15 +4835,17 @@
[(set_attr "type" "f_minmax<s>")]
)
@@ -5345,7 +6006,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
)
;; For copysign (x, y), we want to generate:
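The floating-point min/max hunk just above extends the f_minmax patterns to the half-precision modes. For float, C99 fmaxf already has the maxNum NaN semantics these patterns implement, so it can lower straight to fmaxnm; a sketch:

    #include <math.h>

    /* Sketch: fmaxf's NaN handling matches fmaxnm, so no libcall is
       needed on targets with these patterns.  */
    float pick (float a, float b)
    {
      return fmaxf (a, b);
    }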
-@@ -5191,7 +5313,7 @@
+@@ -5201,7 +5310,7 @@
UNSPEC_SP_TEST))
(clobber (match_scratch:PTR 3 "=&r"))]
""
@@ -5968,7 +6629,52 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+#endif
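Most of the arm_neon.h churn below is mechanical: every intrinsic moves from static __inline to extern __inline with the gnu_inline attribute, which keeps GNU89 inline semantics (the definition is inline-only and never emits an out-of-line symbol) even under -std=c99 and later. A sketch of the idiom, using a made-up function name:

    /* Sketch of the new inlining idiom adopted below: inline-only
       definition, no out-of-line copy per translation unit, debug
       info suppressed via __artificial__.  "twice" is a hypothetical
       stand-in for an intrinsic.  */
    __extension__ extern __inline int
    __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
    twice (int __a)
    {
      return __a + __a;
    }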
--- a/src/gcc/config/aarch64/arm_neon.h
+++ b/src/gcc/config/aarch64/arm_neon.h
-@@ -466,6 +466,8 @@ typedef struct poly16x8x4_t
+@@ -58,6 +58,7 @@ typedef __Float64x2_t float64x2_t;
+ typedef __Poly8x16_t poly8x16_t;
+ typedef __Poly16x8_t poly16x8_t;
+ typedef __Poly64x2_t poly64x2_t;
++typedef __Poly64x1_t poly64x1_t;
+ typedef __Uint8x16_t uint8x16_t;
+ typedef __Uint16x8_t uint16x8_t;
+ typedef __Uint32x4_t uint32x4_t;
+@@ -202,6 +203,36 @@ typedef struct poly16x8x2_t
+ poly16x8_t val[2];
+ } poly16x8x2_t;
+
++typedef struct poly64x1x2_t
++{
++ poly64x1_t val[2];
++} poly64x1x2_t;
++
++typedef struct poly64x1x3_t
++{
++ poly64x1_t val[3];
++} poly64x1x3_t;
++
++typedef struct poly64x1x4_t
++{
++ poly64x1_t val[4];
++} poly64x1x4_t;
++
++typedef struct poly64x2x2_t
++{
++ poly64x2_t val[2];
++} poly64x2x2_t;
++
++typedef struct poly64x2x3_t
++{
++ poly64x2_t val[3];
++} poly64x2x3_t;
++
++typedef struct poly64x2x4_t
++{
++ poly64x2_t val[4];
++} poly64x2x4_t;
++
+ typedef struct int8x8x3_t
+ {
+ int8x8_t val[3];
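The poly64x1xN_t/poly64x2xN_t aggregates added above mirror the existing NxM structs for the other element types; they are the return types of the multi-vector structure loads for p64. A sketch (vld2_p64 is assumed here, and requires the crypto extension):

    #include <arm_neon.h>

    /* Sketch: a two-vector structure load de-interleaves into the
       new poly64x1x2_t aggregate.  */
    poly64x1x2_t load_pair (const poly64_t *p)
    {
      return vld2_p64 (p);
    }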
+@@ -466,6 +497,8 @@ typedef struct poly16x8x4_t
#define __aarch64_vdup_lane_any(__size, __q, __a, __b) \
vdup##__q##_n_##__size (__aarch64_vget_lane_any (__a, __b))
@@ -5977,7 +6683,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
#define __aarch64_vdup_lane_f32(__a, __b) \
__aarch64_vdup_lane_any (f32, , __a, __b)
#define __aarch64_vdup_lane_f64(__a, __b) \
-@@ -492,6 +494,8 @@ typedef struct poly16x8x4_t
+@@ -474,6 +507,8 @@ typedef struct poly16x8x4_t
+ __aarch64_vdup_lane_any (p8, , __a, __b)
+ #define __aarch64_vdup_lane_p16(__a, __b) \
+ __aarch64_vdup_lane_any (p16, , __a, __b)
++#define __aarch64_vdup_lane_p64(__a, __b) \
++ __aarch64_vdup_lane_any (p64, , __a, __b)
+ #define __aarch64_vdup_lane_s8(__a, __b) \
+ __aarch64_vdup_lane_any (s8, , __a, __b)
+ #define __aarch64_vdup_lane_s16(__a, __b) \
+@@ -492,6 +527,8 @@ typedef struct poly16x8x4_t
__aarch64_vdup_lane_any (u64, , __a, __b)
/* __aarch64_vdup_laneq internal macros. */
@@ -5986,7 +6701,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
#define __aarch64_vdup_laneq_f32(__a, __b) \
__aarch64_vdup_lane_any (f32, , __a, __b)
#define __aarch64_vdup_laneq_f64(__a, __b) \
-@@ -518,6 +522,8 @@ typedef struct poly16x8x4_t
+@@ -500,6 +537,8 @@ typedef struct poly16x8x4_t
+ __aarch64_vdup_lane_any (p8, , __a, __b)
+ #define __aarch64_vdup_laneq_p16(__a, __b) \
+ __aarch64_vdup_lane_any (p16, , __a, __b)
++#define __aarch64_vdup_laneq_p64(__a, __b) \
++ __aarch64_vdup_lane_any (p64, , __a, __b)
+ #define __aarch64_vdup_laneq_s8(__a, __b) \
+ __aarch64_vdup_lane_any (s8, , __a, __b)
+ #define __aarch64_vdup_laneq_s16(__a, __b) \
+@@ -518,6 +557,8 @@ typedef struct poly16x8x4_t
__aarch64_vdup_lane_any (u64, , __a, __b)
/* __aarch64_vdupq_lane internal macros. */
@@ -5995,7 +6719,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
#define __aarch64_vdupq_lane_f32(__a, __b) \
__aarch64_vdup_lane_any (f32, q, __a, __b)
#define __aarch64_vdupq_lane_f64(__a, __b) \
-@@ -544,6 +550,8 @@ typedef struct poly16x8x4_t
+@@ -526,6 +567,8 @@ typedef struct poly16x8x4_t
+ __aarch64_vdup_lane_any (p8, q, __a, __b)
+ #define __aarch64_vdupq_lane_p16(__a, __b) \
+ __aarch64_vdup_lane_any (p16, q, __a, __b)
++#define __aarch64_vdupq_lane_p64(__a, __b) \
++ __aarch64_vdup_lane_any (p64, q, __a, __b)
+ #define __aarch64_vdupq_lane_s8(__a, __b) \
+ __aarch64_vdup_lane_any (s8, q, __a, __b)
+ #define __aarch64_vdupq_lane_s16(__a, __b) \
+@@ -544,6 +587,8 @@ typedef struct poly16x8x4_t
__aarch64_vdup_lane_any (u64, q, __a, __b)
/* __aarch64_vdupq_laneq internal macros. */
@@ -6004,7 +6737,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
#define __aarch64_vdupq_laneq_f32(__a, __b) \
__aarch64_vdup_lane_any (f32, q, __a, __b)
#define __aarch64_vdupq_laneq_f64(__a, __b) \
-@@ -601,535 +609,619 @@ typedef struct poly16x8x4_t
+@@ -552,6 +597,8 @@ typedef struct poly16x8x4_t
+ __aarch64_vdup_lane_any (p8, q, __a, __b)
+ #define __aarch64_vdupq_laneq_p16(__a, __b) \
+ __aarch64_vdup_lane_any (p16, q, __a, __b)
++#define __aarch64_vdupq_laneq_p64(__a, __b) \
++ __aarch64_vdup_lane_any (p64, q, __a, __b)
+ #define __aarch64_vdupq_laneq_s8(__a, __b) \
+ __aarch64_vdup_lane_any (s8, q, __a, __b)
+ #define __aarch64_vdupq_laneq_s16(__a, __b) \
+@@ -601,535 +648,619 @@ typedef struct poly16x8x4_t
})
/* vadd */
@@ -6708,7 +7450,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vaddhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
{
return (uint8x16_t) __builtin_aarch64_addhn2v8hi ((int8x8_t) __a,
-@@ -1137,7 +1229,8 @@ vaddhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
+@@ -1137,7 +1268,8 @@ vaddhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
(int16x8_t) __c);
}
@@ -6718,7 +7460,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vaddhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
{
return (uint16x8_t) __builtin_aarch64_addhn2v4si ((int16x4_t) __a,
-@@ -1145,7 +1238,8 @@ vaddhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
+@@ -1145,7 +1277,8 @@ vaddhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
(int32x4_t) __c);
}
@@ -6728,7 +7470,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vaddhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
{
return (uint32x4_t) __builtin_aarch64_addhn2v2di ((int32x2_t) __a,
-@@ -1153,25 +1247,29 @@ vaddhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
+@@ -1153,25 +1286,29 @@ vaddhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
(int64x2_t) __c);
}
@@ -6762,7 +7504,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vraddhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
{
return (uint8x16_t) __builtin_aarch64_raddhn2v8hi ((int8x8_t) __a,
-@@ -1179,7 +1277,8 @@ vraddhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
+@@ -1179,7 +1316,8 @@ vraddhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
(int16x8_t) __c);
}
@@ -6772,7 +7514,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vraddhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
{
return (uint16x8_t) __builtin_aarch64_raddhn2v4si ((int16x4_t) __a,
-@@ -1187,7 +1286,8 @@ vraddhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
+@@ -1187,7 +1325,8 @@ vraddhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
(int32x4_t) __c);
}
@@ -6782,7 +7524,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vraddhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
{
return (uint32x4_t) __builtin_aarch64_raddhn2v2di ((int32x2_t) __a,
-@@ -1195,1101 +1295,1280 @@ vraddhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
+@@ -1195,1101 +1334,1280 @@ vraddhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
(int64x2_t) __c);
}
@@ -8242,7 +8984,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vrsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
{
return (uint8x16_t) __builtin_aarch64_rsubhn2v8hi ((int8x8_t) __a,
-@@ -2297,7 +2576,8 @@ vrsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
+@@ -2297,7 +2615,8 @@ vrsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
(int16x8_t) __c);
}
@@ -8252,7 +8994,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vrsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
{
return (uint16x8_t) __builtin_aarch64_rsubhn2v4si ((int16x4_t) __a,
-@@ -2305,7 +2585,8 @@ vrsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
+@@ -2305,7 +2624,8 @@ vrsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
(int32x4_t) __c);
}
@@ -8262,7 +9004,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vrsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
{
return (uint32x4_t) __builtin_aarch64_rsubhn2v2di ((int32x2_t) __a,
-@@ -2313,25 +2594,29 @@ vrsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
+@@ -2313,25 +2633,29 @@ vrsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
(int64x2_t) __c);
}
@@ -8296,7 +9038,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
{
return (uint8x16_t) __builtin_aarch64_subhn2v8hi ((int8x8_t) __a,
-@@ -2339,7 +2624,8 @@ vsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
+@@ -2339,7 +2663,8 @@ vsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
(int16x8_t) __c);
}
@@ -8306,7 +9048,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
{
return (uint16x8_t) __builtin_aarch64_subhn2v4si ((int16x4_t) __a,
-@@ -2347,7 +2633,8 @@ vsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
+@@ -2347,7 +2672,8 @@ vsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
(int32x4_t) __c);
}
@@ -8316,7 +9058,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
{
return (uint32x4_t) __builtin_aarch64_subhn2v2di ((int32x2_t) __a,
-@@ -2355,373 +2642,435 @@ vsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
+@@ -2355,453 +2681,542 @@ vsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
(int64x2_t) __c);
}
@@ -8814,8 +9556,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vcreate_p16 (uint64_t __a)
{
return (poly16x4_t) __a;
-@@ -2729,79 +3078,92 @@ vcreate_p16 (uint64_t __a)
+ }
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcreate_p64 (uint64_t __a)
++{
++ return (poly64x1_t) __a;
++}
++
/* vget_lane */
-__extension__ static __inline float16_t __attribute__ ((__always_inline__))
@@ -8859,6 +9608,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vget_lane_p64 (poly64x1_t __a, const int __b)
++{
++ return __aarch64_vget_lane_any (__a, __b);
++}
++
+__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_lane_s8 (int8x8_t __a, const int __b)
@@ -8920,7 +9676,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vget_lane_u64 (uint64x1_t __a, const int __b)
{
return __aarch64_vget_lane_any (__a, __b);
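With vcreate_p64 and vget_lane_p64 added above, a poly64x1_t can be built from, and taken back apart into, a plain 64-bit integer; a sketch:

    #include <arm_neon.h>

    /* Sketch: round-trip a value through the new poly64
       create/get-lane intrinsics added above.  */
    uint64_t roundtrip (uint64_t bits)
    {
      poly64x1_t v = vcreate_p64 (bits);
      return (uint64_t) vget_lane_p64 (v, 0);
    }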
-@@ -2809,79 +3171,92 @@ vget_lane_u64 (uint64x1_t __a, const int __b)
+@@ -2809,79 +3224,99 @@ vget_lane_u64 (uint64x1_t __a, const int __b)
/* vgetq_lane */
@@ -8965,6 +9721,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vgetq_lane_p64 (poly64x2_t __a, const int __b)
++{
++ return __aarch64_vget_lane_any (__a, __b);
++}
++
+__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vgetq_lane_s8 (int8x16_t __a, const int __b)
@@ -9026,7 +9789,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vgetq_lane_u64 (uint64x2_t __a, const int __b)
{
return __aarch64_vget_lane_any (__a, __b);
-@@ -2889,1873 +3264,2185 @@ vgetq_lane_u64 (uint64x2_t __a, const int __b)
+@@ -2889,1953 +3324,2832 @@ vgetq_lane_u64 (uint64x2_t __a, const int __b)
/* vreinterpret */
@@ -9127,6 +9890,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_p8_p64 (poly64x1_t __a)
++{
++ return (poly8x8_t) __a;
++}
++
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_p8_f64 (float64x2_t __a)
@@ -9223,6 +9993,20 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p8_p64 (poly64x2_t __a)
++{
++ return (poly8x16_t) __a;
++}
++
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p8_p128 (poly128_t __a)
++{
++ return (poly8x16_t)__a;
++}
++
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_p16_f16 (float16x4_t __a)
@@ -9319,6 +10103,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_p16_p64 (poly64x1_t __a)
++{
++ return (poly16x4_t) __a;
++}
++
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_p16_f64 (float64x2_t __a)
@@ -9415,6 +10206,300 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p16_p64 (poly64x2_t __a)
++{
++ return (poly16x8_t) __a;
++}
++
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p16_p128 (poly128_t __a)
++{
++ return (poly16x8_t)__a;
++}
++
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_p64_f16 (float16x4_t __a)
++{
++ return (poly64x1_t) __a;
++}
++
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_p64_f64 (float64x1_t __a)
++{
++ return (poly64x1_t) __a;
++}
++
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_p64_s8 (int8x8_t __a)
++{
++ return (poly64x1_t) __a;
++}
++
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_p64_s16 (int16x4_t __a)
++{
++ return (poly64x1_t) __a;
++}
++
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_p64_s32 (int32x2_t __a)
++{
++ return (poly64x1_t) __a;
++}
++
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_p64_s64 (int64x1_t __a)
++{
++ return (poly64x1_t) __a;
++}
++
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_p64_f32 (float32x2_t __a)
++{
++ return (poly64x1_t) __a;
++}
++
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_p64_u8 (uint8x8_t __a)
++{
++ return (poly64x1_t) __a;
++}
++
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_p64_u16 (uint16x4_t __a)
++{
++ return (poly64x1_t) __a;
++}
++
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_p64_u32 (uint32x2_t __a)
++{
++ return (poly64x1_t) __a;
++}
++
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_p64_u64 (uint64x1_t __a)
++{
++ return (poly64x1_t) __a;
++}
++
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_p64_p8 (poly8x8_t __a)
++{
++ return (poly64x1_t) __a;
++}
++
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_p64_p16 (poly16x4_t __a)
++{
++ return (poly64x1_t)__a;
++}
++
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p64_f64 (float64x2_t __a)
++{
++ return (poly64x2_t) __a;
++}
++
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p64_s8 (int8x16_t __a)
++{
++ return (poly64x2_t) __a;
++}
++
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p64_s16 (int16x8_t __a)
++{
++ return (poly64x2_t) __a;
++}
++
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p64_s32 (int32x4_t __a)
++{
++ return (poly64x2_t) __a;
++}
++
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p64_s64 (int64x2_t __a)
++{
++ return (poly64x2_t) __a;
++}
++
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p64_f16 (float16x8_t __a)
++{
++ return (poly64x2_t) __a;
++}
++
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p64_f32 (float32x4_t __a)
++{
++ return (poly64x2_t) __a;
++}
++
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p64_p128 (poly128_t __a)
++{
++ return (poly64x2_t)__a;
++}
++
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p64_u8 (uint8x16_t __a)
++{
++ return (poly64x2_t) __a;
++}
++
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p64_u16 (uint16x8_t __a)
++{
++ return (poly64x2_t) __a;
++}
++
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p64_p16 (poly16x8_t __a)
++{
++ return (poly64x2_t)__a;
++}
++
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p64_u32 (uint32x4_t __a)
++{
++ return (poly64x2_t) __a;
++}
++
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p64_u64 (uint64x2_t __a)
++{
++ return (poly64x2_t) __a;
++}
++
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p64_p8 (poly8x16_t __a)
++{
++ return (poly64x2_t) __a;
++}
++
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p128_p8 (poly8x16_t __a)
++{
++ return (poly128_t)__a;
++}
++
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p128_p16 (poly16x8_t __a)
++{
++ return (poly128_t)__a;
++}
++
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p128_f16 (float16x8_t __a)
++{
++ return (poly128_t) __a;
++}
++
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p128_f32 (float32x4_t __a)
++{
++ return (poly128_t)__a;
++}
++
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p128_p64 (poly64x2_t __a)
++{
++ return (poly128_t)__a;
++}
++
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p128_s64 (int64x2_t __a)
++{
++ return (poly128_t)__a;
++}
++
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p128_u64 (uint64x2_t __a)
++{
++ return (poly128_t)__a;
++}
++
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p128_s8 (int8x16_t __a)
++{
++ return (poly128_t)__a;
++}
++
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p128_s16 (int16x8_t __a)
++{
++ return (poly128_t)__a;
++}
++
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p128_s32 (int32x4_t __a)
++{
++ return (poly128_t)__a;
++}
++
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p128_u8 (uint8x16_t __a)
++{
++ return (poly128_t)__a;
++}
++
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p128_u16 (uint16x8_t __a)
++{
++ return (poly128_t)__a;
++}
++
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_p128_u32 (uint32x4_t __a)
++{
++ return (poly128_t)__a;
++}
++
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_f16_f64 (float64x1_t __a)
@@ -9511,6 +10596,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_f16_p64 (poly64x1_t __a)
++{
++ return (float16x4_t) __a;
++}
++
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_f16_f64 (float64x2_t __a)
@@ -9600,6 +10692,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_f16_p128 (poly128_t __a)
++{
++ return (float16x8_t) __a;
++}
++
++__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_f16_p16 (poly16x8_t __a)
{
@@ -9607,6 +10706,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_f16_p64 (poly64x2_t __a)
++{
++ return (float16x8_t) __a;
++}
++
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_f32_f16 (float16x4_t __a)
@@ -9703,6 +10809,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_f32_p64 (poly64x1_t __a)
++{
++ return (float32x2_t) __a;
++}
++
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_f32_f16 (float16x8_t __a)
@@ -9799,6 +10912,21 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline float64x1_t __attribute__((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_f32_p64 (poly64x2_t __a)
++{
++ return (float32x4_t) __a;
++}
++
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_f32_p128 (poly128_t __a)
++{
++ return (float32x4_t)__a;
++}
++
++
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_f64_f16 (float16x4_t __a)
@@ -9833,6 +10961,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float64x1_t __attribute__((__always_inline__))
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_f64_p64 (poly64x1_t __a)
++{
++ return (float64x1_t) __a;
++}
++
++__extension__ extern __inline float64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_f64_s8 (int8x8_t __a)
{
return (float64x1_t) __a;
@@ -9929,6 +11064,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float64x2_t __attribute__((__always_inline__))
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_f64_p64 (poly64x2_t __a)
++{
++ return (float64x2_t) __a;
++}
++
++__extension__ extern __inline float64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_f64_s8 (int8x16_t __a)
{
return (float64x2_t) __a;
@@ -10087,6 +11229,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_s64_p64 (poly64x1_t __a)
++{
++ return (int64x1_t) __a;
++}
++
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_s64_f64 (float64x2_t __a)
@@ -10183,6 +11332,20 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_s64_p64 (poly64x2_t __a)
++{
++ return (int64x2_t) __a;
++}
++
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_s64_p128 (poly128_t __a)
++{
++ return (int64x2_t)__a;
++}
++
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_u64_f16 (float16x4_t __a)
@@ -10279,6 +11442,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_u64_p64 (poly64x1_t __a)
++{
++ return (uint64x1_t) __a;
++}
++
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_u64_f64 (float64x2_t __a)
@@ -10375,6 +11545,20 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_u64_p64 (poly64x2_t __a)
++{
++ return (uint64x2_t) __a;
++}
++
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_u64_p128 (poly128_t __a)
++{
++ return (uint64x2_t)__a;
++}
++
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_s8_f16 (float16x4_t __a)
@@ -10471,6 +11655,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_s8_p64 (poly64x1_t __a)
++{
++ return (int8x8_t) __a;
++}
++
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_s8_f64 (float64x2_t __a)
@@ -10567,6 +11758,20 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_s8_p64 (poly64x2_t __a)
++{
++ return (int8x16_t) __a;
++}
++
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_s8_p128 (poly128_t __a)
++{
++ return (int8x16_t)__a;
++}
++
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_s16_f16 (float16x4_t __a)
@@ -10663,6 +11868,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_s16_p64 (poly64x1_t __a)
++{
++ return (int16x4_t) __a;
++}
++
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_s16_f64 (float64x2_t __a)
@@ -10759,6 +11971,20 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_s16_p64 (poly64x2_t __a)
++{
++ return (int16x8_t) __a;
++}
++
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_s16_p128 (poly128_t __a)
++{
++ return (int16x8_t)__a;
++}
++
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_s32_f16 (float16x4_t __a)
@@ -10855,6 +12081,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_s32_p64 (poly64x1_t __a)
++{
++ return (int32x2_t) __a;
++}
++
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_s32_f64 (float64x2_t __a)
@@ -10951,6 +12184,20 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_s32_p64 (poly64x2_t __a)
++{
++ return (int32x4_t) __a;
++}
++
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_s32_p128 (poly128_t __a)
++{
++ return (int32x4_t)__a;
++}
++
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_u8_f16 (float16x4_t __a)
@@ -11047,6 +12294,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_u8_p64 (poly64x1_t __a)
++{
++ return (uint8x8_t) __a;
++}
++
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_u8_f64 (float64x2_t __a)
@@ -11143,6 +12397,20 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_u8_p64 (poly64x2_t __a)
++{
++ return (uint8x16_t) __a;
++}
++
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_u8_p128 (poly128_t __a)
++{
++ return (uint8x16_t)__a;
++}
++
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_u16_f16 (float16x4_t __a)
@@ -11239,6 +12507,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_u16_p64 (poly64x1_t __a)
++{
++ return (uint16x4_t) __a;
++}
++
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_u16_f64 (float64x2_t __a)
@@ -11335,6 +12610,20 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_u16_p64 (poly64x2_t __a)
++{
++ return (uint16x8_t) __a;
++}
++
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_u16_p128 (poly128_t __a)
++{
++ return (uint16x8_t)__a;
++}
++
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_u32_f16 (float16x4_t __a)
@@ -11431,6 +12720,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpret_u32_p64 (poly64x1_t __a)
++{
++ return (uint32x2_t) __a;
++}
++
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_u32_f64 (float64x2_t __a)
@@ -11524,8 +12820,22 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vreinterpretq_u32_p16 (poly16x8_t __a)
{
return (uint32x4_t) __a;
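The long run of additions above completes the vreinterpret matrix for poly64x1_t, poly64x2_t and poly128_t; all of them are pure bit-pattern casts. Their usual customer is carry-less multiplication; a sketch (vmull_p64 assumed, which needs the crypto extension):

    #include <arm_neon.h>

    /* Sketch: reinterpret into p64, carry-less multiply to p128,
       then reinterpret the product back to bytes.  */
    uint8x16_t clmul_bytes (uint64x1_t a, uint64x1_t b)
    {
      poly128_t prod = vmull_p64 (vget_lane_p64 (vreinterpret_p64_u64 (a), 0),
                                  vget_lane_p64 (vreinterpret_p64_u64 (b), 0));
      return vreinterpretq_u8_p128 (prod);
    }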
-@@ -4763,79 +5450,92 @@ vreinterpretq_u32_p16 (poly16x8_t __a)
+ }
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_u32_p64 (poly64x2_t __a)
++{
++ return (uint32x4_t) __a;
++}
++
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vreinterpretq_u32_p128 (poly128_t __a)
++{
++ return (uint32x4_t)__a;
++}
++
/* vset_lane */
-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
@@ -11569,6 +12879,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vset_lane_p64 (poly64_t __elem, poly64x1_t __vec, const int __index)
++{
++ return __aarch64_vset_lane_any (__elem, __vec, __index);
++}
++
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vset_lane_s8 (int8_t __elem, int8x8_t __vec, const int __index)
@@ -11630,7 +12947,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vset_lane_u64 (uint64_t __elem, uint64x1_t __vec, const int __index)
{
return __aarch64_vset_lane_any (__elem, __vec, __index);
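vset_lane_p64 above (and vsetq_lane_p64 in the next hunk) completes the lane-insert family for p64; a sketch:

    #include <arm_neon.h>

    /* Sketch: patch lane 0 of a poly64x1_t without a memory
       round-trip, via the vset_lane_p64 added above.  */
    poly64x1_t set0 (poly64x1_t v, poly64_t x)
    {
      return vset_lane_p64 (x, v, 0);
    }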
-@@ -4843,79 +5543,92 @@ vset_lane_u64 (uint64_t __elem, uint64x1_t __vec, const int __index)
+@@ -4843,79 +6157,99 @@ vset_lane_u64 (uint64_t __elem, uint64x1_t __vec, const int __index)
/* vsetq_lane */
@@ -11675,6 +12992,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vsetq_lane_p64 (poly64_t __elem, poly64x2_t __vec, const int __index)
++{
++ return __aarch64_vset_lane_any (__elem, __vec, __index);
++}
++
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vsetq_lane_s8 (int8_t __elem, int8x16_t __vec, const int __index)
@@ -11736,7 +13060,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
{
return __aarch64_vset_lane_any (__elem, __vec, __index);
-@@ -4926,79 +5639,92 @@ vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
+@@ -4926,79 +6260,99 @@ vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
uint64x1_t lo = vcreate_u64 (vgetq_lane_u64 (tmp, 0)); \
return vreinterpret_##__TYPE##_u64 (lo);
@@ -11781,6 +13105,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vget_low_p64 (poly64x2_t __a)
++{
++ __GET_LOW (p64);
++}
++
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_low_s8 (int8x16_t __a)
@@ -11842,7 +13173,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vget_low_u64 (uint64x2_t __a)
{
return vcreate_u64 (vgetq_lane_u64 (__a, 0));
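vget_low_p64 above, together with vget_high_p64 and vcombine_p64 in the hunks that follow, gives p64 the usual split/join trio; a sketch:

    #include <arm_neon.h>

    /* Sketch: swap the halves of a poly64x2_t with the new p64
       half-vector intrinsics (vget_high_p64 and vcombine_p64 appear
       in the hunks below).  */
    poly64x2_t swap_halves (poly64x2_t v)
    {
      return vcombine_p64 (vget_high_p64 (v), vget_low_p64 (v));
    }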
-@@ -5011,73 +5737,85 @@ vget_low_u64 (uint64x2_t __a)
+@@ -5011,73 +6365,92 @@ vget_low_u64 (uint64x2_t __a)
uint64x1_t hi = vcreate_u64 (vgetq_lane_u64 (tmp, 1)); \
return vreinterpret_##__TYPE##_u64 (hi);
@@ -11887,6 +13218,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vget_high_p64 (poly64x2_t __a)
++{
++ __GET_HIGH (p64);
++}
++
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_high_s8 (int8x16_t __a)
@@ -11940,7 +13278,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vget_high_u32 (uint32x4_t __a)
{
__GET_HIGH (u32);
-@@ -5085,89 +5823,103 @@ vget_high_u32 (uint32x4_t __a)
+@@ -5085,98 +6458,120 @@ vget_high_u32 (uint32x4_t __a)
#undef __GET_HIGH
@@ -12058,8 +13396,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vcombine_p16 (poly16x4_t __a, poly16x4_t __b)
{
return (poly16x8_t) __builtin_aarch64_combinev4hi ((int16x4_t) __a,
-@@ -5176,7 +5928,8 @@ vcombine_p16 (poly16x4_t __a, poly16x4_t __b)
+ (int16x4_t) __b);
+ }
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcombine_p64 (poly64x1_t __a, poly64x1_t __b)
++{
++ return (poly64x2_t) __builtin_aarch64_combinedi_ppp (__a[0], __b[0]);
++}
++
/* Start of temporary inline asm implementations. */
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
@@ -12068,7 +13414,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vaba_s8 (int8x8_t a, int8x8_t b, int8x8_t c)
{
int8x8_t result;
-@@ -5187,7 +5940,8 @@ vaba_s8 (int8x8_t a, int8x8_t b, int8x8_t c)
+@@ -5187,7 +6582,8 @@ vaba_s8 (int8x8_t a, int8x8_t b, int8x8_t c)
return result;
}
@@ -12078,7 +13424,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vaba_s16 (int16x4_t a, int16x4_t b, int16x4_t c)
{
int16x4_t result;
-@@ -5198,7 +5952,8 @@ vaba_s16 (int16x4_t a, int16x4_t b, int16x4_t c)
+@@ -5198,7 +6594,8 @@ vaba_s16 (int16x4_t a, int16x4_t b, int16x4_t c)
return result;
}
@@ -12088,7 +13434,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vaba_s32 (int32x2_t a, int32x2_t b, int32x2_t c)
{
int32x2_t result;
-@@ -5209,7 +5964,8 @@ vaba_s32 (int32x2_t a, int32x2_t b, int32x2_t c)
+@@ -5209,7 +6606,8 @@ vaba_s32 (int32x2_t a, int32x2_t b, int32x2_t c)
return result;
}
@@ -12098,7 +13444,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vaba_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c)
{
uint8x8_t result;
-@@ -5220,7 +5976,8 @@ vaba_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c)
+@@ -5220,7 +6618,8 @@ vaba_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c)
return result;
}
@@ -12108,7 +13454,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vaba_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c)
{
uint16x4_t result;
-@@ -5231,7 +5988,8 @@ vaba_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c)
+@@ -5231,7 +6630,8 @@ vaba_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c)
return result;
}
@@ -12118,7 +13464,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vaba_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
{
uint32x2_t result;
-@@ -5242,7 +6000,8 @@ vaba_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
+@@ -5242,7 +6642,8 @@ vaba_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
return result;
}
@@ -12128,7 +13474,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabal_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c)
{
int16x8_t result;
-@@ -5253,7 +6012,8 @@ vabal_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c)
+@@ -5253,7 +6654,8 @@ vabal_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c)
return result;
}
@@ -12138,7 +13484,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabal_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c)
{
int32x4_t result;
-@@ -5264,7 +6024,8 @@ vabal_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c)
+@@ -5264,7 +6666,8 @@ vabal_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c)
return result;
}
@@ -12148,7 +13494,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabal_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c)
{
int64x2_t result;
-@@ -5275,7 +6036,8 @@ vabal_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c)
+@@ -5275,7 +6678,8 @@ vabal_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c)
return result;
}
@@ -12158,7 +13504,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabal_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c)
{
uint16x8_t result;
-@@ -5286,7 +6048,8 @@ vabal_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c)
+@@ -5286,7 +6690,8 @@ vabal_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c)
return result;
}
@@ -12168,7 +13514,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabal_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c)
{
uint32x4_t result;
-@@ -5297,7 +6060,8 @@ vabal_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c)
+@@ -5297,7 +6702,8 @@ vabal_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c)
return result;
}
@@ -12178,7 +13524,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
{
uint64x2_t result;
-@@ -5308,7 +6072,8 @@ vabal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
+@@ -5308,7 +6714,8 @@ vabal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
return result;
}
@@ -12188,7 +13534,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabal_s8 (int16x8_t a, int8x8_t b, int8x8_t c)
{
int16x8_t result;
-@@ -5319,7 +6084,8 @@ vabal_s8 (int16x8_t a, int8x8_t b, int8x8_t c)
+@@ -5319,7 +6726,8 @@ vabal_s8 (int16x8_t a, int8x8_t b, int8x8_t c)
return result;
}
@@ -12198,7 +13544,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabal_s16 (int32x4_t a, int16x4_t b, int16x4_t c)
{
int32x4_t result;
-@@ -5330,7 +6096,8 @@ vabal_s16 (int32x4_t a, int16x4_t b, int16x4_t c)
+@@ -5330,7 +6738,8 @@ vabal_s16 (int32x4_t a, int16x4_t b, int16x4_t c)
return result;
}
@@ -12208,7 +13554,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabal_s32 (int64x2_t a, int32x2_t b, int32x2_t c)
{
int64x2_t result;
-@@ -5341,7 +6108,8 @@ vabal_s32 (int64x2_t a, int32x2_t b, int32x2_t c)
+@@ -5341,7 +6750,8 @@ vabal_s32 (int64x2_t a, int32x2_t b, int32x2_t c)
return result;
}
@@ -12218,7 +13564,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c)
{
uint16x8_t result;
-@@ -5352,7 +6120,8 @@ vabal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c)
+@@ -5352,7 +6762,8 @@ vabal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c)
return result;
}
@@ -12228,7 +13574,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c)
{
uint32x4_t result;
-@@ -5363,7 +6132,8 @@ vabal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c)
+@@ -5363,7 +6774,8 @@ vabal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c)
return result;
}
@@ -12238,7 +13584,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
{
uint64x2_t result;
-@@ -5374,7 +6144,8 @@ vabal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
+@@ -5374,7 +6786,8 @@ vabal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
return result;
}
@@ -12248,7 +13594,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c)
{
int8x16_t result;
-@@ -5385,7 +6156,8 @@ vabaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c)
+@@ -5385,7 +6798,8 @@ vabaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c)
return result;
}
@@ -12258,7 +13604,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c)
{
int16x8_t result;
-@@ -5396,7 +6168,8 @@ vabaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c)
+@@ -5396,7 +6810,8 @@ vabaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c)
return result;
}
@@ -12268,7 +13614,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c)
{
int32x4_t result;
-@@ -5407,7 +6180,8 @@ vabaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c)
+@@ -5407,7 +6822,8 @@ vabaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c)
return result;
}
@@ -12278,7 +13624,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c)
{
uint8x16_t result;
-@@ -5418,7 +6192,8 @@ vabaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c)
+@@ -5418,7 +6834,8 @@ vabaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c)
return result;
}
@@ -12288,7 +13634,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c)
{
uint16x8_t result;
-@@ -5429,7 +6204,8 @@ vabaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c)
+@@ -5429,7 +6846,8 @@ vabaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c)
return result;
}
@@ -12298,7 +13644,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
{
uint32x4_t result;
-@@ -5440,18 +6216,8 @@ vabaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
+@@ -5440,18 +6858,8 @@ vabaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
return result;
}
@@ -12319,7 +13665,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabd_s8 (int8x8_t a, int8x8_t b)
{
int8x8_t result;
-@@ -5462,7 +6228,8 @@ vabd_s8 (int8x8_t a, int8x8_t b)
+@@ -5462,7 +6870,8 @@ vabd_s8 (int8x8_t a, int8x8_t b)
return result;
}
@@ -12329,7 +13675,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabd_s16 (int16x4_t a, int16x4_t b)
{
int16x4_t result;
-@@ -5473,7 +6240,8 @@ vabd_s16 (int16x4_t a, int16x4_t b)
+@@ -5473,7 +6882,8 @@ vabd_s16 (int16x4_t a, int16x4_t b)
return result;
}
@@ -12339,7 +13685,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabd_s32 (int32x2_t a, int32x2_t b)
{
int32x2_t result;
-@@ -5484,7 +6252,8 @@ vabd_s32 (int32x2_t a, int32x2_t b)
+@@ -5484,7 +6894,8 @@ vabd_s32 (int32x2_t a, int32x2_t b)
return result;
}
@@ -12349,7 +13695,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabd_u8 (uint8x8_t a, uint8x8_t b)
{
uint8x8_t result;
-@@ -5495,7 +6264,8 @@ vabd_u8 (uint8x8_t a, uint8x8_t b)
+@@ -5495,7 +6906,8 @@ vabd_u8 (uint8x8_t a, uint8x8_t b)
return result;
}
@@ -12359,7 +13705,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabd_u16 (uint16x4_t a, uint16x4_t b)
{
uint16x4_t result;
-@@ -5506,7 +6276,8 @@ vabd_u16 (uint16x4_t a, uint16x4_t b)
+@@ -5506,7 +6918,8 @@ vabd_u16 (uint16x4_t a, uint16x4_t b)
return result;
}
@@ -12369,7 +13715,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabd_u32 (uint32x2_t a, uint32x2_t b)
{
uint32x2_t result;
-@@ -5517,18 +6288,8 @@ vabd_u32 (uint32x2_t a, uint32x2_t b)
+@@ -5517,18 +6930,8 @@ vabd_u32 (uint32x2_t a, uint32x2_t b)
return result;
}
@@ -12390,7 +13736,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabdl_high_s8 (int8x16_t a, int8x16_t b)
{
int16x8_t result;
-@@ -5539,7 +6300,8 @@ vabdl_high_s8 (int8x16_t a, int8x16_t b)
+@@ -5539,7 +6942,8 @@ vabdl_high_s8 (int8x16_t a, int8x16_t b)
return result;
}
@@ -12400,7 +13746,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabdl_high_s16 (int16x8_t a, int16x8_t b)
{
int32x4_t result;
-@@ -5550,7 +6312,8 @@ vabdl_high_s16 (int16x8_t a, int16x8_t b)
+@@ -5550,7 +6954,8 @@ vabdl_high_s16 (int16x8_t a, int16x8_t b)
return result;
}
@@ -12410,7 +13756,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabdl_high_s32 (int32x4_t a, int32x4_t b)
{
int64x2_t result;
-@@ -5561,7 +6324,8 @@ vabdl_high_s32 (int32x4_t a, int32x4_t b)
+@@ -5561,7 +6966,8 @@ vabdl_high_s32 (int32x4_t a, int32x4_t b)
return result;
}
@@ -12420,7 +13766,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabdl_high_u8 (uint8x16_t a, uint8x16_t b)
{
uint16x8_t result;
-@@ -5572,7 +6336,8 @@ vabdl_high_u8 (uint8x16_t a, uint8x16_t b)
+@@ -5572,7 +6978,8 @@ vabdl_high_u8 (uint8x16_t a, uint8x16_t b)
return result;
}
@@ -12430,7 +13776,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabdl_high_u16 (uint16x8_t a, uint16x8_t b)
{
uint32x4_t result;
-@@ -5583,7 +6348,8 @@ vabdl_high_u16 (uint16x8_t a, uint16x8_t b)
+@@ -5583,7 +6990,8 @@ vabdl_high_u16 (uint16x8_t a, uint16x8_t b)
return result;
}
@@ -12440,7 +13786,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabdl_high_u32 (uint32x4_t a, uint32x4_t b)
{
uint64x2_t result;
-@@ -5594,7 +6360,8 @@ vabdl_high_u32 (uint32x4_t a, uint32x4_t b)
+@@ -5594,7 +7002,8 @@ vabdl_high_u32 (uint32x4_t a, uint32x4_t b)
return result;
}
@@ -12450,7 +13796,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabdl_s8 (int8x8_t a, int8x8_t b)
{
int16x8_t result;
-@@ -5605,7 +6372,8 @@ vabdl_s8 (int8x8_t a, int8x8_t b)
+@@ -5605,7 +7014,8 @@ vabdl_s8 (int8x8_t a, int8x8_t b)
return result;
}
@@ -12460,7 +13806,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabdl_s16 (int16x4_t a, int16x4_t b)
{
int32x4_t result;
-@@ -5616,7 +6384,8 @@ vabdl_s16 (int16x4_t a, int16x4_t b)
+@@ -5616,7 +7026,8 @@ vabdl_s16 (int16x4_t a, int16x4_t b)
return result;
}
@@ -12470,7 +13816,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabdl_s32 (int32x2_t a, int32x2_t b)
{
int64x2_t result;
-@@ -5627,7 +6396,8 @@ vabdl_s32 (int32x2_t a, int32x2_t b)
+@@ -5627,7 +7038,8 @@ vabdl_s32 (int32x2_t a, int32x2_t b)
return result;
}
@@ -12480,7 +13826,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabdl_u8 (uint8x8_t a, uint8x8_t b)
{
uint16x8_t result;
-@@ -5638,7 +6408,8 @@ vabdl_u8 (uint8x8_t a, uint8x8_t b)
+@@ -5638,7 +7050,8 @@ vabdl_u8 (uint8x8_t a, uint8x8_t b)
return result;
}
@@ -12490,7 +13836,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabdl_u16 (uint16x4_t a, uint16x4_t b)
{
uint32x4_t result;
-@@ -5649,7 +6420,8 @@ vabdl_u16 (uint16x4_t a, uint16x4_t b)
+@@ -5649,7 +7062,8 @@ vabdl_u16 (uint16x4_t a, uint16x4_t b)
return result;
}
@@ -12500,7 +13846,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabdl_u32 (uint32x2_t a, uint32x2_t b)
{
uint64x2_t result;
-@@ -5660,29 +6432,8 @@ vabdl_u32 (uint32x2_t a, uint32x2_t b)
+@@ -5660,29 +7074,8 @@ vabdl_u32 (uint32x2_t a, uint32x2_t b)
return result;
}
@@ -12532,7 +13878,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabdq_s8 (int8x16_t a, int8x16_t b)
{
int8x16_t result;
-@@ -5693,7 +6444,8 @@ vabdq_s8 (int8x16_t a, int8x16_t b)
+@@ -5693,7 +7086,8 @@ vabdq_s8 (int8x16_t a, int8x16_t b)
return result;
}
@@ -12542,7 +13888,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabdq_s16 (int16x8_t a, int16x8_t b)
{
int16x8_t result;
-@@ -5704,7 +6456,8 @@ vabdq_s16 (int16x8_t a, int16x8_t b)
+@@ -5704,7 +7098,8 @@ vabdq_s16 (int16x8_t a, int16x8_t b)
return result;
}
@@ -12552,7 +13898,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabdq_s32 (int32x4_t a, int32x4_t b)
{
int32x4_t result;
-@@ -5715,7 +6468,8 @@ vabdq_s32 (int32x4_t a, int32x4_t b)
+@@ -5715,7 +7110,8 @@ vabdq_s32 (int32x4_t a, int32x4_t b)
return result;
}
@@ -12562,7 +13908,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabdq_u8 (uint8x16_t a, uint8x16_t b)
{
uint8x16_t result;
-@@ -5726,7 +6480,8 @@ vabdq_u8 (uint8x16_t a, uint8x16_t b)
+@@ -5726,7 +7122,8 @@ vabdq_u8 (uint8x16_t a, uint8x16_t b)
return result;
}
@@ -12572,7 +13918,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabdq_u16 (uint16x8_t a, uint16x8_t b)
{
uint16x8_t result;
-@@ -5737,7 +6492,8 @@ vabdq_u16 (uint16x8_t a, uint16x8_t b)
+@@ -5737,7 +7134,8 @@ vabdq_u16 (uint16x8_t a, uint16x8_t b)
return result;
}
@@ -12582,7 +13928,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vabdq_u32 (uint32x4_t a, uint32x4_t b)
{
uint32x4_t result;
-@@ -5748,18 +6504,8 @@ vabdq_u32 (uint32x4_t a, uint32x4_t b)
+@@ -5748,18 +7146,8 @@ vabdq_u32 (uint32x4_t a, uint32x4_t b)
return result;
}
@@ -12603,7 +13949,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vaddlv_s8 (int8x8_t a)
{
int16_t result;
-@@ -5770,7 +6516,8 @@ vaddlv_s8 (int8x8_t a)
+@@ -5770,7 +7158,8 @@ vaddlv_s8 (int8x8_t a)
return result;
}
@@ -12613,7 +13959,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vaddlv_s16 (int16x4_t a)
{
int32_t result;
-@@ -5781,7 +6528,8 @@ vaddlv_s16 (int16x4_t a)
+@@ -5781,7 +7170,8 @@ vaddlv_s16 (int16x4_t a)
return result;
}
@@ -12623,7 +13969,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vaddlv_u8 (uint8x8_t a)
{
uint16_t result;
-@@ -5792,7 +6540,8 @@ vaddlv_u8 (uint8x8_t a)
+@@ -5792,7 +7182,8 @@ vaddlv_u8 (uint8x8_t a)
return result;
}
@@ -12633,7 +13979,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vaddlv_u16 (uint16x4_t a)
{
uint32_t result;
-@@ -5803,7 +6552,8 @@ vaddlv_u16 (uint16x4_t a)
+@@ -5803,7 +7194,8 @@ vaddlv_u16 (uint16x4_t a)
return result;
}
@@ -12643,7 +13989,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vaddlvq_s8 (int8x16_t a)
{
int16_t result;
-@@ -5814,7 +6564,8 @@ vaddlvq_s8 (int8x16_t a)
+@@ -5814,7 +7206,8 @@ vaddlvq_s8 (int8x16_t a)
return result;
}
@@ -12653,7 +13999,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vaddlvq_s16 (int16x8_t a)
{
int32_t result;
-@@ -5825,7 +6576,8 @@ vaddlvq_s16 (int16x8_t a)
+@@ -5825,7 +7218,8 @@ vaddlvq_s16 (int16x8_t a)
return result;
}
@@ -12663,7 +14009,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vaddlvq_s32 (int32x4_t a)
{
int64_t result;
-@@ -5836,7 +6588,8 @@ vaddlvq_s32 (int32x4_t a)
+@@ -5836,7 +7230,8 @@ vaddlvq_s32 (int32x4_t a)
return result;
}
@@ -12673,7 +14019,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vaddlvq_u8 (uint8x16_t a)
{
uint16_t result;
-@@ -5847,7 +6600,8 @@ vaddlvq_u8 (uint8x16_t a)
+@@ -5847,7 +7242,8 @@ vaddlvq_u8 (uint8x16_t a)
return result;
}
@@ -12683,7 +14029,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vaddlvq_u16 (uint16x8_t a)
{
uint32_t result;
-@@ -5858,7 +6612,8 @@ vaddlvq_u16 (uint16x8_t a)
+@@ -5858,7 +7254,8 @@ vaddlvq_u16 (uint16x8_t a)
return result;
}
@@ -12693,7 +14039,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
vaddlvq_u32 (uint32x4_t a)
{
uint64_t result;
-@@ -5869,18584 +6624,22583 @@ vaddlvq_u32 (uint32x4_t a)
+@@ -5869,18584 +7266,23100 @@ vaddlvq_u32 (uint32x4_t a)
return result;
}
@@ -12720,19 +14066,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ : /* No clobbers */);
+ return result;
+}
-
--#define vcopyq_lane_f64(a, b, c, d) \
-- __extension__ \
-- ({ \
-- float64x2_t c_ = (c); \
-- float64x2_t a_ = (a); \
-- float64x2_t result; \
-- __asm__ ("ins %0.d[%2], %3.d[%4]" \
-- : "=w"(result) \
-- : "0"(a_), "i"(b), "w"(c_), "i"(d) \
-- : /* No clobbers */); \
-- result; \
-- })
++
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtx_high_f32_f64 (float32x2_t a, float64x2_t b)
@@ -12890,14 +14224,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return result;
+}
--#define vcopyq_lane_p8(a, b, c, d) \
+-#define vcopyq_lane_f64(a, b, c, d) \
+#define vmlal_high_lane_s16(a, b, c, d) \
__extension__ \
({ \
-- poly8x16_t c_ = (c); \
-- poly8x16_t a_ = (a); \
-- poly8x16_t result; \
-- __asm__ ("ins %0.b[%2], %3.b[%4]" \
+- float64x2_t c_ = (c); \
+- float64x2_t a_ = (a); \
+- float64x2_t result; \
+- __asm__ ("ins %0.d[%2], %3.d[%4]" \
+ int16x4_t c_ = (c); \
+ int16x8_t b_ = (b); \
+ int32x4_t a_ = (a); \
@@ -12910,14 +14244,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
result; \
})
--#define vcopyq_lane_p16(a, b, c, d) \
+-#define vcopyq_lane_p8(a, b, c, d) \
+#define vmlal_high_lane_s32(a, b, c, d) \
__extension__ \
({ \
-- poly16x8_t c_ = (c); \
-- poly16x8_t a_ = (a); \
-- poly16x8_t result; \
-- __asm__ ("ins %0.h[%2], %3.h[%4]" \
+- poly8x16_t c_ = (c); \
+- poly8x16_t a_ = (a); \
+- poly8x16_t result; \
+- __asm__ ("ins %0.b[%2], %3.b[%4]" \
+ int32x2_t c_ = (c); \
+ int32x4_t b_ = (b); \
+ int64x2_t a_ = (a); \
@@ -12930,14 +14264,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
result; \
})
--#define vcopyq_lane_s8(a, b, c, d) \
+-#define vcopyq_lane_p16(a, b, c, d) \
+#define vmlal_high_lane_u16(a, b, c, d) \
__extension__ \
({ \
-- int8x16_t c_ = (c); \
-- int8x16_t a_ = (a); \
-- int8x16_t result; \
-- __asm__ ("ins %0.b[%2], %3.b[%4]" \
+- poly16x8_t c_ = (c); \
+- poly16x8_t a_ = (a); \
+- poly16x8_t result; \
+- __asm__ ("ins %0.h[%2], %3.h[%4]" \
+ uint16x4_t c_ = (c); \
+ uint16x8_t b_ = (b); \
+ uint32x4_t a_ = (a); \
@@ -12950,14 +14284,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
result; \
})
--#define vcopyq_lane_s16(a, b, c, d) \
+-#define vcopyq_lane_s8(a, b, c, d) \
+#define vmlal_high_lane_u32(a, b, c, d) \
__extension__ \
({ \
-- int16x8_t c_ = (c); \
-- int16x8_t a_ = (a); \
-- int16x8_t result; \
-- __asm__ ("ins %0.h[%2], %3.h[%4]" \
+- int8x16_t c_ = (c); \
+- int8x16_t a_ = (a); \
+- int8x16_t result; \
+- __asm__ ("ins %0.b[%2], %3.b[%4]" \
+ uint32x2_t c_ = (c); \
+ uint32x4_t b_ = (b); \
+ uint64x2_t a_ = (a); \
@@ -12970,12 +14304,24 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
result; \
})
--#define vcopyq_lane_s32(a, b, c, d) \
+-#define vcopyq_lane_s16(a, b, c, d) \
+#define vmlal_high_laneq_s16(a, b, c, d) \
__extension__ \
({ \
+ int16x8_t c_ = (c); \
+- int16x8_t a_ = (a); \
+- int16x8_t result; \
+- __asm__ ("ins %0.h[%2], %3.h[%4]" \
+- : "=w"(result) \
+- : "0"(a_), "i"(b), "w"(c_), "i"(d) \
+- : /* No clobbers */); \
+- result; \
+- })
+-
+-#define vcopyq_lane_s32(a, b, c, d) \
+- __extension__ \
+- ({ \
- int32x4_t c_ = (c); \
-+ int16x8_t c_ = (c); \
+ int16x8_t b_ = (b); \
int32x4_t a_ = (a); \
int32x4_t result; \
@@ -13027,9 +14373,20 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- uint16x8_t a_ = (a); \
- uint16x8_t result; \
- __asm__ ("ins %0.h[%2], %3.h[%4]" \
+- : "=w"(result) \
+- : "0"(a_), "i"(b), "w"(c_), "i"(d) \
+- : /* No clobbers */); \
+- result; \
+- })
+-
+-#define vcopyq_lane_u32(a, b, c, d) \
+- __extension__ \
+- ({ \
+- uint32x4_t c_ = (c); \
+ uint16x8_t b_ = (b); \
-+ uint32x4_t a_ = (a); \
-+ uint32x4_t result; \
+ uint32x4_t a_ = (a); \
+ uint32x4_t result; \
+- __asm__ ("ins %0.s[%2], %3.s[%4]" \
+ __asm__ ("umlal2 %0.4s, %2.8h, %3.h[%4]" \
: "=w"(result) \
- : "0"(a_), "i"(b), "w"(c_), "i"(d) \
@@ -13038,17 +14395,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
result; \
})
--#define vcopyq_lane_u32(a, b, c, d) \
+-#define vcopyq_lane_u64(a, b, c, d) \
+#define vmlal_high_laneq_u32(a, b, c, d) \
__extension__ \
({ \
- uint32x4_t c_ = (c); \
-- uint32x4_t a_ = (a); \
-- uint32x4_t result; \
-- __asm__ ("ins %0.s[%2], %3.s[%4]" \
+- uint64x2_t c_ = (c); \
++ uint32x4_t c_ = (c); \
+ uint32x4_t b_ = (b); \
-+ uint64x2_t a_ = (a); \
-+ uint64x2_t result; \
+ uint64x2_t a_ = (a); \
+ uint64x2_t result; \
+- __asm__ ("ins %0.d[%2], %3.d[%4]" \
+ __asm__ ("umlal2 %0.2d, %2.4s, %3.s[%4]" \
: "=w"(result) \
- : "0"(a_), "i"(b), "w"(c_), "i"(d) \
@@ -13057,15 +14413,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
result; \
})
--#define vcopyq_lane_u64(a, b, c, d) \
+-#define vcvt_n_f32_s32(a, b) \
- __extension__ \
- ({ \
-- uint64x2_t c_ = (c); \
-- uint64x2_t a_ = (a); \
-- uint64x2_t result; \
-- __asm__ ("ins %0.d[%2], %3.d[%4]" \
+- int32x2_t a_ = (a); \
+- float32x2_t result; \
+- __asm__ ("scvtf %0.2s, %1.2s, #%2" \
- : "=w"(result) \
-- : "0"(a_), "i"(b), "w"(c_), "i"(d) \
+- : "w"(a_), "i"(b) \
- : /* No clobbers */); \
- result; \
- })
@@ -13081,12 +14436,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return result;
+}
--#define vcvt_n_f32_s32(a, b) \
+-#define vcvt_n_f32_u32(a, b) \
- __extension__ \
- ({ \
-- int32x2_t a_ = (a); \
+- uint32x2_t a_ = (a); \
- float32x2_t result; \
-- __asm__ ("scvtf %0.2s, %1.2s, #%2" \
+- __asm__ ("ucvtf %0.2s, %1.2s, #%2" \
- : "=w"(result) \
- : "w"(a_), "i"(b) \
- : /* No clobbers */); \
@@ -13104,12 +14459,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return result;
+}
--#define vcvt_n_f32_u32(a, b) \
+-#define vcvt_n_s32_f32(a, b) \
- __extension__ \
- ({ \
-- uint32x2_t a_ = (a); \
-- float32x2_t result; \
-- __asm__ ("ucvtf %0.2s, %1.2s, #%2" \
+- float32x2_t a_ = (a); \
+- int32x2_t result; \
+- __asm__ ("fcvtzs %0.2s, %1.2s, #%2" \
- : "=w"(result) \
- : "w"(a_), "i"(b) \
- : /* No clobbers */); \
@@ -13127,12 +14482,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return result;
+}
--#define vcvt_n_s32_f32(a, b) \
+-#define vcvt_n_u32_f32(a, b) \
- __extension__ \
- ({ \
- float32x2_t a_ = (a); \
-- int32x2_t result; \
-- __asm__ ("fcvtzs %0.2s, %1.2s, #%2" \
+- uint32x2_t result; \
+- __asm__ ("fcvtzu %0.2s, %1.2s, #%2" \
- : "=w"(result) \
- : "w"(a_), "i"(b) \
- : /* No clobbers */); \
@@ -13150,12 +14505,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return result;
+}
--#define vcvt_n_u32_f32(a, b) \
+-#define vcvtd_n_f64_s64(a, b) \
- __extension__ \
- ({ \
-- float32x2_t a_ = (a); \
-- uint32x2_t result; \
-- __asm__ ("fcvtzu %0.2s, %1.2s, #%2" \
+- int64_t a_ = (a); \
+- float64_t result; \
+- __asm__ ("scvtf %d0,%d1,%2" \
- : "=w"(result) \
- : "w"(a_), "i"(b) \
- : /* No clobbers */); \
@@ -13173,12 +14528,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return result;
+}
--#define vcvtd_n_f64_s64(a, b) \
+-#define vcvtd_n_f64_u64(a, b) \
- __extension__ \
- ({ \
-- int64_t a_ = (a); \
+- uint64_t a_ = (a); \
- float64_t result; \
-- __asm__ ("scvtf %d0,%d1,%2" \
+- __asm__ ("ucvtf %d0,%d1,%2" \
- : "=w"(result) \
- : "w"(a_), "i"(b) \
- : /* No clobbers */); \
@@ -13196,12 +14551,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return result;
+}
--#define vcvtd_n_f64_u64(a, b) \
+-#define vcvtd_n_s64_f64(a, b) \
- __extension__ \
- ({ \
-- uint64_t a_ = (a); \
-- float64_t result; \
-- __asm__ ("ucvtf %d0,%d1,%2" \
+- float64_t a_ = (a); \
+- int64_t result; \
+- __asm__ ("fcvtzs %d0,%d1,%2" \
- : "=w"(result) \
- : "w"(a_), "i"(b) \
- : /* No clobbers */); \
@@ -13219,12 +14574,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return result;
+}
--#define vcvtd_n_s64_f64(a, b) \
+-#define vcvtd_n_u64_f64(a, b) \
- __extension__ \
- ({ \
- float64_t a_ = (a); \
-- int64_t result; \
-- __asm__ ("fcvtzs %d0,%d1,%2" \
+- uint64_t result; \
+- __asm__ ("fcvtzu %d0,%d1,%2" \
- : "=w"(result) \
- : "w"(a_), "i"(b) \
- : /* No clobbers */); \
@@ -13241,18 +14596,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ : /* No clobbers */);
+ return result;
+}
-
--#define vcvtd_n_u64_f64(a, b) \
-- __extension__ \
-- ({ \
-- float64_t a_ = (a); \
-- uint64_t result; \
-- __asm__ ("fcvtzu %d0,%d1,%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
++
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlal_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c)
@@ -13410,11 +14754,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
__extension__ \
({ \
- float64x2_t a_ = (a); \
+- uint64x2_t result; \
+- __asm__ ("fcvtzu %0.2d, %1.2d, #%2" \
+ uint32x4_t c_ = (c); \
+ uint32x2_t b_ = (b); \
+ uint64x2_t a_ = (a); \
- uint64x2_t result; \
-- __asm__ ("fcvtzu %0.2d, %1.2d, #%2" \
++ uint64x2_t result; \
+ __asm__ ("umlal %0.2d, %2.2s, %3.s[%4]" \
: "=w"(result) \
- : "w"(a_), "i"(b) \
@@ -13537,100 +14882,28 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlal_s16 (int32x4_t a, int16x4_t b, int16x4_t c)
-+{
-+ int32x4_t result;
-+ __asm__ ("smlal %0.4s,%2.4h,%3.4h"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_s32 (int64x2_t a, int32x2_t b, int32x2_t c)
-+{
-+ int64x2_t result;
-+ __asm__ ("smlal %0.2d,%2.2s,%3.2s"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c)
-+{
-+ uint16x8_t result;
-+ __asm__ ("umlal %0.8h,%2.8b,%3.8b"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c)
-+{
-+ uint32x4_t result;
-+ __asm__ ("umlal %0.4s,%2.4h,%3.4h"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
-+{
-+ uint64x2_t result;
-+ __asm__ ("umlal %0.2d,%2.2s,%3.2s"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c)
{
- float32x4_t result;
+- float32x4_t result;
- __asm__ ("fcvtxn2 %0.4s,%1.2d"
-+ float32x4_t t1;
-+ __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fadd %0.4s, %0.4s, %1.4s"
-+ : "=w"(result), "=w"(t1)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_n_s16 (int16x8_t a, int16x8_t b, int16_t c)
-+{
-+ int16x8_t result;
-+ __asm__ ("mla %0.8h,%2.8h,%3.h[0]"
++ int32x4_t result;
++ __asm__ ("smlal %0.4s,%2.4h,%3.4h"
: "=w"(result)
- : "w" (b), "0"(a)
-+ : "0"(a), "w"(b), "x"(c)
++ : "0"(a), "w"(b), "w"(c)
: /* No clobbers */);
return result;
}
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vcvtxd_f32_f64 (float64_t a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_n_s32 (int32x4_t a, int32x4_t b, int32_t c)
++vmlal_s32 (int64x2_t a, int32x2_t b, int32x2_t c)
{
- float32_t result;
- __asm__ ("fcvtxn %s0,%d1"
-+ int32x4_t result;
-+ __asm__ ("mla %0.4s,%2.4s,%3.s[0]"
++ int64x2_t result;
++ __asm__ ("smlal %0.2d,%2.2s,%3.2s"
: "=w"(result)
- : "w"(a)
+ : "0"(a), "w"(b), "w"(c)
@@ -13642,108 +14915,50 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vmla_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c)
-+{
++vmlal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c)
+ {
+- float32x2_t result;
+- float32x2_t t1;
+- __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fadd %0.2s, %0.2s, %1.2s"
+ uint16x8_t result;
-+ __asm__ ("mla %0.8h,%2.8h,%3.h[0]"
++ __asm__ ("umlal %0.8h,%2.8b,%3.8b"
+ : "=w"(result)
-+ : "0"(a), "w"(b), "x"(c)
++ : "0"(a), "w"(b), "w"(c)
+ : /* No clobbers */);
+ return result;
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c)
++vmlal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c)
+{
+ uint32x4_t result;
-+ __asm__ ("mla %0.4s,%2.4s,%3.s[0]"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c)
-+{
-+ int8x16_t result;
-+ __asm__ ("mla %0.16b, %2.16b, %3.16b"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c)
-+{
-+ int16x8_t result;
-+ __asm__ ("mla %0.8h, %2.8h, %3.8h"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c)
-+{
-+ int32x4_t result;
-+ __asm__ ("mla %0.4s, %2.4s, %3.4s"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c)
-+{
-+ uint8x16_t result;
-+ __asm__ ("mla %0.16b, %2.16b, %3.16b"
++ __asm__ ("umlal %0.4s,%2.4h,%3.4h"
+ : "=w"(result)
+ : "0"(a), "w"(b), "w"(c)
+ : /* No clobbers */);
+ return result;
+}
+
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c)
++vmlal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
+{
-+ uint16x8_t result;
-+ __asm__ ("mla %0.8h, %2.8h, %3.8h"
++ uint64x2_t result;
++ __asm__ ("umlal %0.2d,%2.2s,%3.2s"
+ : "=w"(result)
+ : "0"(a), "w"(b), "w"(c)
+ : /* No clobbers */);
+ return result;
+}
+
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
++vmlaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c)
+{
-+ uint32x4_t result;
-+ __asm__ ("mla %0.4s, %2.4s, %3.4s"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
- {
- float32x2_t result;
- float32x2_t t1;
-- __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fadd %0.2s, %0.2s, %1.2s"
-+ __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fsub %0.2s, %0.2s, %1.2s"
++ float32x4_t result;
++ float32x4_t t1;
++ __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fadd %0.4s, %0.4s, %1.4s"
: "=w"(result), "=w"(t1)
: "0"(a), "w"(b), "w"(c)
: /* No clobbers */);
@@ -13752,13 +14967,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmla_n_s16 (int16x4_t a, int16x4_t b, int16_t c)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_n_s16 (int16x4_t a, int16x4_t b, int16_t c)
++vmlaq_n_s16 (int16x8_t a, int16x8_t b, int16_t c)
{
- int16x4_t result;
+- int16x4_t result;
- __asm__ ("mla %0.4h,%2.4h,%3.h[0]"
-+ __asm__ ("mls %0.4h, %2.4h, %3.h[0]"
++ int16x8_t result;
++ __asm__ ("mla %0.8h,%2.8h,%3.h[0]"
: "=w"(result)
: "0"(a), "w"(b), "x"(c)
: /* No clobbers */);
@@ -13767,13 +14983,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmla_n_s32 (int32x2_t a, int32x2_t b, int32_t c)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_n_s32 (int32x2_t a, int32x2_t b, int32_t c)
++vmlaq_n_s32 (int32x4_t a, int32x4_t b, int32_t c)
{
- int32x2_t result;
+- int32x2_t result;
- __asm__ ("mla %0.2s,%2.2s,%3.s[0]"
-+ __asm__ ("mls %0.2s, %2.2s, %3.s[0]"
++ int32x4_t result;
++ __asm__ ("mla %0.4s,%2.4s,%3.s[0]"
: "=w"(result)
: "0"(a), "w"(b), "w"(c)
: /* No clobbers */);
@@ -13782,13 +14999,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmla_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c)
++vmlaq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c)
{
- uint16x4_t result;
+- uint16x4_t result;
- __asm__ ("mla %0.4h,%2.4h,%3.h[0]"
-+ __asm__ ("mls %0.4h, %2.4h, %3.h[0]"
++ uint16x8_t result;
++ __asm__ ("mla %0.8h,%2.8h,%3.h[0]"
: "=w"(result)
: "0"(a), "w"(b), "x"(c)
: /* No clobbers */);
@@ -13797,13 +15015,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmla_n_u32 (uint32x2_t a, uint32x2_t b, uint32_t c)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_n_u32 (uint32x2_t a, uint32x2_t b, uint32_t c)
++vmlaq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c)
{
- uint32x2_t result;
+- uint32x2_t result;
- __asm__ ("mla %0.2s,%2.2s,%3.s[0]"
-+ __asm__ ("mls %0.2s, %2.2s, %3.s[0]"
++ uint32x4_t result;
++ __asm__ ("mla %0.4s,%2.4s,%3.s[0]"
: "=w"(result)
: "0"(a), "w"(b), "w"(c)
: /* No clobbers */);
@@ -13812,13 +15031,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vmla_s8 (int8x8_t a, int8x8_t b, int8x8_t c)
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_s8 (int8x8_t a, int8x8_t b, int8x8_t c)
++vmlaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c)
{
- int8x8_t result;
+- int8x8_t result;
- __asm__ ("mla %0.8b, %2.8b, %3.8b"
-+ __asm__ ("mls %0.8b,%2.8b,%3.8b"
++ int8x16_t result;
++ __asm__ ("mla %0.16b, %2.16b, %3.16b"
: "=w"(result)
: "0"(a), "w"(b), "w"(c)
: /* No clobbers */);
@@ -13827,13 +15047,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmla_s16 (int16x4_t a, int16x4_t b, int16x4_t c)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_s16 (int16x4_t a, int16x4_t b, int16x4_t c)
++vmlaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c)
{
- int16x4_t result;
+- int16x4_t result;
- __asm__ ("mla %0.4h, %2.4h, %3.4h"
-+ __asm__ ("mls %0.4h,%2.4h,%3.4h"
++ int16x8_t result;
++ __asm__ ("mla %0.8h, %2.8h, %3.8h"
: "=w"(result)
: "0"(a), "w"(b), "w"(c)
: /* No clobbers */);
@@ -13842,13 +15063,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmla_s32 (int32x2_t a, int32x2_t b, int32x2_t c)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_s32 (int32x2_t a, int32x2_t b, int32x2_t c)
++vmlaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c)
{
- int32x2_t result;
+- int32x2_t result;
- __asm__ ("mla %0.2s, %2.2s, %3.2s"
-+ __asm__ ("mls %0.2s,%2.2s,%3.2s"
++ int32x4_t result;
++ __asm__ ("mla %0.4s, %2.4s, %3.4s"
: "=w"(result)
: "0"(a), "w"(b), "w"(c)
: /* No clobbers */);
@@ -13857,13 +15079,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vmla_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c)
++vmlaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c)
{
- uint8x8_t result;
+- uint8x8_t result;
- __asm__ ("mla %0.8b, %2.8b, %3.8b"
-+ __asm__ ("mls %0.8b,%2.8b,%3.8b"
++ uint8x16_t result;
++ __asm__ ("mla %0.16b, %2.16b, %3.16b"
: "=w"(result)
: "0"(a), "w"(b), "w"(c)
: /* No clobbers */);
@@ -13872,13 +15095,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmla_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c)
++vmlaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c)
{
- uint16x4_t result;
+- uint16x4_t result;
- __asm__ ("mla %0.4h, %2.4h, %3.4h"
-+ __asm__ ("mls %0.4h,%2.4h,%3.4h"
++ uint16x8_t result;
++ __asm__ ("mla %0.8h, %2.8h, %3.8h"
: "=w"(result)
: "0"(a), "w"(b), "w"(c)
: /* No clobbers */);
@@ -13887,13 +15111,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmla_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
++vmlaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
{
- uint32x2_t result;
+- uint32x2_t result;
- __asm__ ("mla %0.2s, %2.2s, %3.2s"
-+ __asm__ ("mls %0.2s,%2.2s,%3.2s"
++ uint32x4_t result;
++ __asm__ ("mla %0.4s, %2.4s, %3.4s"
: "=w"(result)
: "0"(a), "w"(b), "w"(c)
: /* No clobbers */);
@@ -13901,74 +15126,244 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-#define vmlal_high_lane_s16(a, b, c, d) \
+- __extension__ \
+- ({ \
+- int16x4_t c_ = (c); \
+- int16x8_t b_ = (b); \
+- int32x4_t a_ = (a); \
+- int32x4_t result; \
+- __asm__ ("smlal2 %0.4s, %2.8h, %3.h[%4]" \
+- : "=w"(result) \
+- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
+- : /* No clobbers */); \
+- result; \
+- })
+-
+-#define vmlal_high_lane_s32(a, b, c, d) \
+- __extension__ \
+- ({ \
+- int32x2_t c_ = (c); \
+- int32x4_t b_ = (b); \
+- int64x2_t a_ = (a); \
+- int64x2_t result; \
+- __asm__ ("smlal2 %0.2d, %2.4s, %3.s[%4]" \
+- : "=w"(result) \
+- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
+- : /* No clobbers */); \
+- result; \
+- })
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmls_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
++{
++ float32x2_t result;
++ float32x2_t t1;
++ __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fsub %0.2s, %0.2s, %1.2s"
++ : "=w"(result), "=w"(t1)
++ : "0"(a), "w"(b), "w"(c)
++ : /* No clobbers */);
++ return result;
++}
+
+-#define vmlal_high_lane_u16(a, b, c, d) \
+- __extension__ \
+- ({ \
+- uint16x4_t c_ = (c); \
+- uint16x8_t b_ = (b); \
+- uint32x4_t a_ = (a); \
+- uint32x4_t result; \
+- __asm__ ("umlal2 %0.4s, %2.8h, %3.h[%4]" \
+- : "=w"(result) \
+- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
+- : /* No clobbers */); \
+- result; \
+- })
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmls_n_s16 (int16x4_t a, int16x4_t b, int16_t c)
++{
++ int16x4_t result;
++ __asm__ ("mls %0.4h, %2.4h, %3.h[0]"
++ : "=w"(result)
++ : "0"(a), "w"(b), "x"(c)
++ : /* No clobbers */);
++ return result;
++}
+
+-#define vmlal_high_lane_u32(a, b, c, d) \
+- __extension__ \
+- ({ \
+- uint32x2_t c_ = (c); \
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmls_n_s32 (int32x2_t a, int32x2_t b, int32_t c)
++{
++ int32x2_t result;
++ __asm__ ("mls %0.2s, %2.2s, %3.s[0]"
++ : "=w"(result)
++ : "0"(a), "w"(b), "w"(c)
++ : /* No clobbers */);
++ return result;
++}
++
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmls_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c)
++{
++ uint16x4_t result;
++ __asm__ ("mls %0.4h, %2.4h, %3.h[0]"
++ : "=w"(result)
++ : "0"(a), "w"(b), "x"(c)
++ : /* No clobbers */);
++ return result;
++}
++
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmls_n_u32 (uint32x2_t a, uint32x2_t b, uint32_t c)
++{
++ uint32x2_t result;
++ __asm__ ("mls %0.2s, %2.2s, %3.s[0]"
++ : "=w"(result)
++ : "0"(a), "w"(b), "w"(c)
++ : /* No clobbers */);
++ return result;
++}
++
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmls_s8 (int8x8_t a, int8x8_t b, int8x8_t c)
++{
++ int8x8_t result;
++ __asm__ ("mls %0.8b,%2.8b,%3.8b"
++ : "=w"(result)
++ : "0"(a), "w"(b), "w"(c)
++ : /* No clobbers */);
++ return result;
++}
++
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmls_s16 (int16x4_t a, int16x4_t b, int16x4_t c)
++{
++ int16x4_t result;
++ __asm__ ("mls %0.4h,%2.4h,%3.4h"
++ : "=w"(result)
++ : "0"(a), "w"(b), "w"(c)
++ : /* No clobbers */);
++ return result;
++}
++
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmls_s32 (int32x2_t a, int32x2_t b, int32x2_t c)
++{
++ int32x2_t result;
++ __asm__ ("mls %0.2s,%2.2s,%3.2s"
++ : "=w"(result)
++ : "0"(a), "w"(b), "w"(c)
++ : /* No clobbers */);
++ return result;
++}
++
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmls_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c)
++{
++ uint8x8_t result;
++ __asm__ ("mls %0.8b,%2.8b,%3.8b"
++ : "=w"(result)
++ : "0"(a), "w"(b), "w"(c)
++ : /* No clobbers */);
++ return result;
++}
++
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmls_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c)
++{
++ uint16x4_t result;
++ __asm__ ("mls %0.4h,%2.4h,%3.4h"
++ : "=w"(result)
++ : "0"(a), "w"(b), "w"(c)
++ : /* No clobbers */);
++ return result;
++}
++
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmls_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
++{
++ uint32x2_t result;
++ __asm__ ("mls %0.2s,%2.2s,%3.2s"
++ : "=w"(result)
++ : "0"(a), "w"(b), "w"(c)
++ : /* No clobbers */);
++ return result;
++}
++
+#define vmlsl_high_lane_s16(a, b, c, d) \
++ __extension__ \
++ ({ \
++ int16x4_t c_ = (c); \
++ int16x8_t b_ = (b); \
++ int32x4_t a_ = (a); \
++ int32x4_t result; \
++ __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]" \
++ : "=w"(result) \
++ : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
++ : /* No clobbers */); \
++ result; \
++ })
++
++#define vmlsl_high_lane_s32(a, b, c, d) \
++ __extension__ \
++ ({ \
++ int32x2_t c_ = (c); \
++ int32x4_t b_ = (b); \
++ int64x2_t a_ = (a); \
++ int64x2_t result; \
++ __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[%4]" \
++ : "=w"(result) \
++ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
++ : /* No clobbers */); \
++ result; \
++ })
++
++#define vmlsl_high_lane_u16(a, b, c, d) \
++ __extension__ \
++ ({ \
++ uint16x4_t c_ = (c); \
++ uint16x8_t b_ = (b); \
++ uint32x4_t a_ = (a); \
++ uint32x4_t result; \
++ __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]" \
++ : "=w"(result) \
++ : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
++ : /* No clobbers */); \
++ result; \
++ })
++
++#define vmlsl_high_lane_u32(a, b, c, d) \
++ __extension__ \
++ ({ \
++ uint32x2_t c_ = (c); \
+ uint32x4_t b_ = (b); \
+ uint64x2_t a_ = (a); \
+ uint64x2_t result; \
+- __asm__ ("umlal2 %0.2d, %2.4s, %3.s[%4]" \
++ __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[%4]" \
+ : "=w"(result) \
+ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
+ : /* No clobbers */); \
+ result; \
+ })
+
+-#define vmlal_high_laneq_s16(a, b, c, d) \
++#define vmlsl_high_laneq_s16(a, b, c, d) \
__extension__ \
({ \
- int16x4_t c_ = (c); \
- int16x8_t b_ = (b); \
- int32x4_t a_ = (a); \
- int32x4_t result; \
-- __asm__ ("smlal2 %0.4s, %2.8h, %3.h[%4]" \
-+ __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlal_high_lane_s32(a, b, c, d) \
-+#define vmlsl_high_lane_s32(a, b, c, d) \
- __extension__ \
- ({ \
- int32x2_t c_ = (c); \
- int32x4_t b_ = (b); \
- int64x2_t a_ = (a); \
- int64x2_t result; \
-- __asm__ ("smlal2 %0.2d, %2.4s, %3.s[%4]" \
-+ __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlal_high_lane_u16(a, b, c, d) \
-+#define vmlsl_high_lane_u16(a, b, c, d) \
- __extension__ \
- ({ \
- uint16x4_t c_ = (c); \
- uint16x8_t b_ = (b); \
- uint32x4_t a_ = (a); \
- uint32x4_t result; \
-- __asm__ ("umlal2 %0.4s, %2.8h, %3.h[%4]" \
-+ __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlal_high_lane_u32(a, b, c, d) \
-+#define vmlsl_high_lane_u32(a, b, c, d) \
- __extension__ \
- ({ \
- uint32x2_t c_ = (c); \
- uint32x4_t b_ = (b); \
- uint64x2_t a_ = (a); \
- uint64x2_t result; \
-- __asm__ ("umlal2 %0.2d, %2.4s, %3.s[%4]" \
-+ __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlal_high_laneq_s16(a, b, c, d) \
-+#define vmlsl_high_laneq_s16(a, b, c, d) \
- __extension__ \
- ({ \
- int16x8_t c_ = (c); \
+ int16x8_t c_ = (c); \
int16x8_t b_ = (b); \
int32x4_t a_ = (a); \
int32x4_t result; \
@@ -14812,6 +16207,18 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-#define vmlsl_high_lane_s16(a, b, c, d) \
+- __extension__ \
+- ({ \
+- int16x4_t c_ = (c); \
+- int16x8_t b_ = (b); \
+- int32x4_t a_ = (a); \
+- int32x4_t result; \
+- __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]" \
+- : "=w"(result) \
+- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
+- : /* No clobbers */); \
+- result; \
+- })
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovl_u32 (uint32x2_t a)
@@ -14823,7 +16230,20 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ : /* No clobbers */);
+ return result;
+}
-+
+
+-#define vmlsl_high_lane_s32(a, b, c, d) \
+- __extension__ \
+- ({ \
+- int32x2_t c_ = (c); \
+- int32x4_t b_ = (b); \
+- int64x2_t a_ = (a); \
+- int64x2_t result; \
+- __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[%4]" \
+- : "=w"(result) \
+- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
+- : /* No clobbers */); \
+- result; \
+- })
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovn_high_s16 (int8x8_t a, int16x8_t b)
@@ -14835,7 +16255,20 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ : /* No clobbers */);
+ return result;
+}
-+
+
+-#define vmlsl_high_lane_u16(a, b, c, d) \
+- __extension__ \
+- ({ \
+- uint16x4_t c_ = (c); \
+- uint16x8_t b_ = (b); \
+- uint32x4_t a_ = (a); \
+- uint32x4_t result; \
+- __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]" \
+- : "=w"(result) \
+- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
+- : /* No clobbers */); \
+- result; \
+- })
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovn_high_s32 (int16x4_t a, int32x4_t b)
@@ -14847,7 +16280,20 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ : /* No clobbers */);
+ return result;
+}
-+
+
+-#define vmlsl_high_lane_u32(a, b, c, d) \
+- __extension__ \
+- ({ \
+- uint32x2_t c_ = (c); \
+- uint32x4_t b_ = (b); \
+- uint64x2_t a_ = (a); \
+- uint64x2_t result; \
+- __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[%4]" \
+- : "=w"(result) \
+- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
+- : /* No clobbers */); \
+- result; \
+- })
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovn_high_s64 (int32x2_t a, int64x2_t b)
@@ -14859,7 +16305,20 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ : /* No clobbers */);
+ return result;
+}
-+
+
+-#define vmlsl_high_laneq_s16(a, b, c, d) \
+- __extension__ \
+- ({ \
+- int16x8_t c_ = (c); \
+- int16x8_t b_ = (b); \
+- int32x4_t a_ = (a); \
+- int32x4_t result; \
+- __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]" \
+- : "=w"(result) \
+- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
+- : /* No clobbers */); \
+- result; \
+- })
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovn_high_u16 (uint8x8_t a, uint16x8_t b)
@@ -14871,113 +16330,219 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ : /* No clobbers */);
+ return result;
+}
-+
+
+-#define vmlsl_high_laneq_s32(a, b, c, d) \
+- __extension__ \
+- ({ \
+- int32x4_t c_ = (c); \
+- int32x4_t b_ = (b); \
+- int64x2_t a_ = (a); \
+- int64x2_t result; \
+- __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[%4]" \
+- : "=w"(result) \
+- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
+- : /* No clobbers */); \
+- result; \
+- })
+-
+-#define vmlsl_high_laneq_u16(a, b, c, d) \
+- __extension__ \
+- ({ \
+- uint16x8_t c_ = (c); \
+- uint16x8_t b_ = (b); \
+- uint32x4_t a_ = (a); \
+- uint32x4_t result; \
+- __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]" \
+- : "=w"(result) \
+- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
+- : /* No clobbers */); \
+- result; \
+- })
+-
+-#define vmlsl_high_laneq_u32(a, b, c, d) \
+- __extension__ \
+- ({ \
+- uint32x4_t c_ = (c); \
+- uint32x4_t b_ = (b); \
+- uint64x2_t a_ = (a); \
+- uint64x2_t result; \
+- __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[%4]" \
+- : "=w"(result) \
+- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
+- : /* No clobbers */); \
+- result; \
+- })
+-
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vmlsl_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c)
+-{
+- int32x4_t result;
+- __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[0]"
+- : "=w"(result)
+- : "0"(a), "w"(b), "x"(c)
+- : /* No clobbers */);
+- return result;
+-}
+-
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vmlsl_high_n_s32 (int64x2_t a, int32x4_t b, int32_t c)
+-{
+- int64x2_t result;
+- __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[0]"
+- : "=w"(result)
+- : "0"(a), "w"(b), "w"(c)
+- : /* No clobbers */);
+- return result;
+-}
+-
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vmlsl_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovn_high_u32 (uint16x4_t a, uint32x4_t b)
-+{
+ {
+- uint32x4_t result;
+- __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[0]"
+- : "=w"(result)
+- : "0"(a), "w"(b), "x"(c)
+ uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
+ __asm__ ("xtn2 %0.8h,%1.4s"
+ : "+w"(result)
+ : "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
+ : /* No clobbers */);
+ return result;
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vmlsl_high_n_u32 (uint64x2_t a, uint32x4_t b, uint32_t c)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovn_high_u64 (uint32x2_t a, uint64x2_t b)
-+{
+ {
+- uint64x2_t result;
+- __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[0]"
+- : "=w"(result)
+- : "0"(a), "w"(b), "w"(c)
+ uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
+ __asm__ ("xtn2 %0.4s,%1.2d"
+ : "+w"(result)
+ : "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
+ : /* No clobbers */);
+ return result;
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vmlsl_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c)
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovn_s16 (int16x8_t a)
-+{
+ {
+- int16x8_t result;
+- __asm__ ("smlsl2 %0.8h,%2.16b,%3.16b"
+ int8x8_t result;
+ __asm__ ("xtn %0.8b,%1.8h"
-+ : "=w"(result)
+ : "=w"(result)
+- : "0"(a), "w"(b), "w"(c)
+ : "w"(a)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
+ : /* No clobbers */);
+ return result;
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vmlsl_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovn_s32 (int32x4_t a)
-+{
+ {
+- int32x4_t result;
+- __asm__ ("smlsl2 %0.4s,%2.8h,%3.8h"
+ int16x4_t result;
+ __asm__ ("xtn %0.4h,%1.4s"
-+ : "=w"(result)
+ : "=w"(result)
+- : "0"(a), "w"(b), "w"(c)
+ : "w"(a)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
+ : /* No clobbers */);
+ return result;
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vmlsl_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovn_s64 (int64x2_t a)
-+{
+ {
+- int64x2_t result;
+- __asm__ ("smlsl2 %0.2d,%2.4s,%3.4s"
+ int32x2_t result;
+ __asm__ ("xtn %0.2s,%1.2d"
-+ : "=w"(result)
+ : "=w"(result)
+- : "0"(a), "w"(b), "w"(c)
+ : "w"(a)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
+ : /* No clobbers */);
+ return result;
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vmlsl_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovn_u16 (uint16x8_t a)
-+{
+ {
+- uint16x8_t result;
+- __asm__ ("umlsl2 %0.8h,%2.16b,%3.16b"
+ uint8x8_t result;
+ __asm__ ("xtn %0.8b,%1.8h"
-+ : "=w"(result)
+ : "=w"(result)
+- : "0"(a), "w"(b), "w"(c)
+ : "w"(a)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
+ : /* No clobbers */);
+ return result;
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vmlsl_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovn_u32 (uint32x4_t a)
-+{
+ {
+- uint32x4_t result;
+- __asm__ ("umlsl2 %0.4s,%2.8h,%3.8h"
+ uint16x4_t result;
+ __asm__ ("xtn %0.4h,%1.4s"
-+ : "=w"(result)
+ : "=w"(result)
+- : "0"(a), "w"(b), "w"(c)
+ : "w"(a)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
+ : /* No clobbers */);
+ return result;
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vmlsl_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovn_u64 (uint64x2_t a)
-+{
+ {
+- uint64x2_t result;
+- __asm__ ("umlsl2 %0.2d,%2.4s,%3.4s"
+ uint32x2_t result;
+ __asm__ ("xtn %0.2s,%1.2d"
-+ : "=w"(result)
+ : "=w"(result)
+- : "0"(a), "w"(b), "w"(c)
+ : "w"(a)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
+ : /* No clobbers */);
+ return result;
+ }
+
+-#define vmlsl_lane_s16(a, b, c, d) \
+#define vmull_high_lane_s16(a, b, c) \
__extension__ \
({ \
- int16x4_t c_ = (c); \
-- int16x8_t b_ = (b); \
+ int16x4_t b_ = (b); \
- int32x4_t a_ = (a); \
-+ int16x4_t b_ = (b); \
+ int16x8_t a_ = (a); \
int32x4_t result; \
-- __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]" \
+- __asm__ ("smlsl %0.4s, %2.4h, %3.h[%4]" \
+ __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \
: "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
@@ -14986,17 +16551,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
result; \
})
--#define vmlsl_high_lane_s32(a, b, c, d) \
+-#define vmlsl_lane_s32(a, b, c, d) \
+#define vmull_high_lane_s32(a, b, c) \
__extension__ \
({ \
- int32x2_t c_ = (c); \
-- int32x4_t b_ = (b); \
+ int32x2_t b_ = (b); \
- int64x2_t a_ = (a); \
-+ int32x2_t b_ = (b); \
+ int32x4_t a_ = (a); \
int64x2_t result; \
-- __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[%4]" \
+- __asm__ ("smlsl %0.2d, %2.2s, %3.s[%4]" \
+ __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \
: "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
@@ -15005,17 +16569,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
result; \
})
--#define vmlsl_high_lane_u16(a, b, c, d) \
+-#define vmlsl_lane_u16(a, b, c, d) \
+#define vmull_high_lane_u16(a, b, c) \
__extension__ \
({ \
- uint16x4_t c_ = (c); \
-- uint16x8_t b_ = (b); \
+ uint16x4_t b_ = (b); \
- uint32x4_t a_ = (a); \
-+ uint16x4_t b_ = (b); \
+ uint16x8_t a_ = (a); \
uint32x4_t result; \
-- __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]" \
+- __asm__ ("umlsl %0.4s, %2.4h, %3.h[%4]" \
+ __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \
: "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
@@ -15024,17 +16587,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
result; \
})
--#define vmlsl_high_lane_u32(a, b, c, d) \
+-#define vmlsl_lane_u32(a, b, c, d) \
+#define vmull_high_lane_u32(a, b, c) \
__extension__ \
({ \
- uint32x2_t c_ = (c); \
-- uint32x4_t b_ = (b); \
+ uint32x2_t b_ = (b); \
- uint64x2_t a_ = (a); \
-+ uint32x2_t b_ = (b); \
+ uint32x4_t a_ = (a); \
uint64x2_t result; \
-- __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[%4]" \
+- __asm__ ("umlsl %0.2d, %2.2s, %3.s[%4]" \
+ __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \
: "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
@@ -15043,16 +16605,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
result; \
})
--#define vmlsl_high_laneq_s16(a, b, c, d) \
+-#define vmlsl_laneq_s16(a, b, c, d) \
+#define vmull_high_laneq_s16(a, b, c) \
__extension__ \
({ \
- int16x8_t c_ = (c); \
- int16x8_t b_ = (b); \
+- int16x4_t b_ = (b); \
- int32x4_t a_ = (a); \
++ int16x8_t b_ = (b); \
+ int16x8_t a_ = (a); \
int32x4_t result; \
-- __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]" \
+- __asm__ ("smlsl %0.4s, %2.4h, %3.h[%4]" \
+ __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \
: "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
@@ -15061,16 +16624,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
result; \
})
--#define vmlsl_high_laneq_s32(a, b, c, d) \
+-#define vmlsl_laneq_s32(a, b, c, d) \
+#define vmull_high_laneq_s32(a, b, c) \
__extension__ \
({ \
- int32x4_t c_ = (c); \
- int32x4_t b_ = (b); \
+- int32x2_t b_ = (b); \
- int64x2_t a_ = (a); \
++ int32x4_t b_ = (b); \
+ int32x4_t a_ = (a); \
int64x2_t result; \
-- __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[%4]" \
+- __asm__ ("smlsl %0.2d, %2.2s, %3.s[%4]" \
+ __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \
: "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
@@ -15079,16 +16643,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
result; \
})
--#define vmlsl_high_laneq_u16(a, b, c, d) \
+-#define vmlsl_laneq_u16(a, b, c, d) \
+#define vmull_high_laneq_u16(a, b, c) \
__extension__ \
({ \
- uint16x8_t c_ = (c); \
- uint16x8_t b_ = (b); \
+- uint16x4_t b_ = (b); \
- uint32x4_t a_ = (a); \
++ uint16x8_t b_ = (b); \
+ uint16x8_t a_ = (a); \
uint32x4_t result; \
-- __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]" \
+- __asm__ ("umlsl %0.4s, %2.4h, %3.h[%4]" \
+ __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \
: "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
@@ -15097,16 +16662,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
result; \
})
--#define vmlsl_high_laneq_u32(a, b, c, d) \
+-#define vmlsl_laneq_u32(a, b, c, d) \
+#define vmull_high_laneq_u32(a, b, c) \
__extension__ \
({ \
- uint32x4_t c_ = (c); \
- uint32x4_t b_ = (b); \
+- uint32x2_t b_ = (b); \
- uint64x2_t a_ = (a); \
++ uint32x4_t b_ = (b); \
+ uint32x4_t a_ = (a); \
uint64x2_t result; \
-- __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[%4]" \
+- __asm__ ("umlsl %0.2d, %2.2s, %3.s[%4]" \
+ __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \
: "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
@@ -15116,13 +16682,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
})
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlsl_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c)
+-vmlsl_n_s16 (int32x4_t a, int16x4_t b, int16_t c)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_high_n_s16 (int16x8_t a, int16_t b)
{
int32x4_t result;
-- __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[0]"
+- __asm__ ("smlsl %0.4s, %2.4h, %3.h[0]"
+ __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]"
: "=w"(result)
- : "0"(a), "w"(b), "x"(c)
@@ -15132,13 +16698,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmlsl_high_n_s32 (int64x2_t a, int32x4_t b, int32_t c)
+-vmlsl_n_s32 (int64x2_t a, int32x2_t b, int32_t c)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_high_n_s32 (int32x4_t a, int32_t b)
{
int64x2_t result;
-- __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[0]"
+- __asm__ ("smlsl %0.2d, %2.2s, %3.s[0]"
+ __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]"
: "=w"(result)
- : "0"(a), "w"(b), "w"(c)
@@ -15148,13 +16714,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlsl_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c)
+-vmlsl_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_high_n_u16 (uint16x8_t a, uint16_t b)
{
uint32x4_t result;
-- __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[0]"
+- __asm__ ("umlsl %0.4s, %2.4h, %3.h[0]"
+ __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]"
: "=w"(result)
- : "0"(a), "w"(b), "x"(c)
@@ -15164,13 +16730,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmlsl_high_n_u32 (uint64x2_t a, uint32x4_t b, uint32_t c)
+-vmlsl_n_u32 (uint64x2_t a, uint32x2_t b, uint32_t c)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_high_n_u32 (uint32x4_t a, uint32_t b)
{
uint64x2_t result;
-- __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[0]"
+- __asm__ ("umlsl %0.2d, %2.2s, %3.s[0]"
+ __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]"
: "=w"(result)
- : "0"(a), "w"(b), "w"(c)
@@ -15180,13 +16746,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmlsl_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c)
+-vmlsl_s8 (int16x8_t a, int8x8_t b, int8x8_t c)
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_high_p8 (poly8x16_t a, poly8x16_t b)
{
- int16x8_t result;
-- __asm__ ("smlsl2 %0.8h,%2.16b,%3.16b"
+- __asm__ ("smlsl %0.8h, %2.8b, %3.8b"
+ poly16x8_t result;
+ __asm__ ("pmull2 %0.8h,%1.16b,%2.16b"
: "=w"(result)
@@ -15197,13 +16763,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlsl_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c)
+-vmlsl_s16 (int32x4_t a, int16x4_t b, int16x4_t c)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_high_s8 (int8x16_t a, int8x16_t b)
{
- int32x4_t result;
-- __asm__ ("smlsl2 %0.4s,%2.8h,%3.8h"
+- __asm__ ("smlsl %0.4s, %2.4h, %3.4h"
+ int16x8_t result;
+ __asm__ ("smull2 %0.8h,%1.16b,%2.16b"
: "=w"(result)
@@ -15214,43 +16780,48 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmlsl_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c)
+-vmlsl_s32 (int64x2_t a, int32x2_t b, int32x2_t c)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_high_s16 (int16x8_t a, int16x8_t b)
{
- int64x2_t result;
-- __asm__ ("smlsl2 %0.2d,%2.4s,%3.4s"
-- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
+- __asm__ ("smlsl %0.2d, %2.2s, %3.2s"
+ int32x4_t result;
+ __asm__ ("smull2 %0.4s,%1.8h,%2.8h"
-+ : "=w"(result)
+ : "=w"(result)
+- : "0"(a), "w"(b), "w"(c)
+ : "w"(a), "w"(b)
: /* No clobbers */);
return result;
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmlsl_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c)
+-vmlsl_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_high_s32 (int32x4_t a, int32x4_t b)
-+{
+ {
+- uint16x8_t result;
+- __asm__ ("umlsl %0.8h, %2.8b, %3.8b"
+ int64x2_t result;
+ __asm__ ("smull2 %0.2d,%1.4s,%2.4s"
-+ : "=w"(result)
+ : "=w"(result)
+- : "0"(a), "w"(b), "w"(c)
+ : "w"(a), "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
+ : /* No clobbers */);
+ return result;
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vmlsl_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_high_u8 (uint8x16_t a, uint8x16_t b)
{
- uint16x8_t result;
-- __asm__ ("umlsl2 %0.8h,%2.16b,%3.16b"
+- uint32x4_t result;
+- __asm__ ("umlsl %0.4s, %2.4h, %3.4h"
++ uint16x8_t result;
+ __asm__ ("umull2 %0.8h,%1.16b,%2.16b"
: "=w"(result)
- : "0"(a), "w"(b), "w"(c)
@@ -15259,14 +16830,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return result;
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlsl_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_high_u16 (uint16x8_t a, uint16x8_t b)
{
- uint32x4_t result;
-- __asm__ ("umlsl2 %0.4s,%2.8h,%3.8h"
+- uint64x2_t result;
+- __asm__ ("umlsl %0.2d, %2.2s, %3.2s"
++ uint32x4_t result;
+ __asm__ ("umull2 %0.4s,%1.8h,%2.8h"
: "=w"(result)
- : "0"(a), "w"(b), "w"(c)
@@ -15275,178 +16847,138 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return result;
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmlsl_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vmlsq_n_f32 (float32x4_t a, float32x4_t b, float32_t c)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_high_u32 (uint32x4_t a, uint32x4_t b)
{
- uint64x2_t result;
-- __asm__ ("umlsl2 %0.2d,%2.4s,%3.4s"
-+ __asm__ ("umull2 %0.2d,%1.4s,%2.4s"
- : "=w"(result)
+- float32x4_t result;
+- float32x4_t t1;
+- __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fsub %0.4s, %0.4s, %1.4s"
+- : "=w"(result), "=w"(t1)
- : "0"(a), "w"(b), "w"(c)
++ uint64x2_t result;
++ __asm__ ("umull2 %0.2d,%1.4s,%2.4s"
++ : "=w"(result)
+ : "w"(a), "w"(b)
: /* No clobbers */);
return result;
}
--#define vmlsl_lane_s16(a, b, c, d) \
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vmlsq_n_s16 (int16x8_t a, int16x8_t b, int16_t c)
+#define vmull_lane_s16(a, b, c) \
- __extension__ \
- ({ \
-- int16x4_t c_ = (c); \
- int16x4_t b_ = (b); \
-- int32x4_t a_ = (a); \
++ __extension__ \
++ ({ \
++ int16x4_t b_ = (b); \
+ int16x4_t a_ = (a); \
- int32x4_t result; \
-- __asm__ ("smlsl %0.4s, %2.4h, %3.h[%4]" \
++ int32x4_t result; \
+ __asm__ ("smull %0.4s,%1.4h,%2.h[%3]" \
- : "=w"(result) \
-- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
++ : "=w"(result) \
+ : "w"(a_), "x"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlsl_lane_s32(a, b, c, d) \
++ : /* No clobbers */); \
++ result; \
++ })
++
+#define vmull_lane_s32(a, b, c) \
- __extension__ \
- ({ \
-- int32x2_t c_ = (c); \
- int32x2_t b_ = (b); \
-- int64x2_t a_ = (a); \
++ __extension__ \
++ ({ \
++ int32x2_t b_ = (b); \
+ int32x2_t a_ = (a); \
- int64x2_t result; \
-- __asm__ ("smlsl %0.2d, %2.2s, %3.s[%4]" \
++ int64x2_t result; \
+ __asm__ ("smull %0.2d,%1.2s,%2.s[%3]" \
- : "=w"(result) \
-- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
++ : "=w"(result) \
+ : "w"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlsl_lane_u16(a, b, c, d) \
++ : /* No clobbers */); \
++ result; \
++ })
++
+#define vmull_lane_u16(a, b, c) \
- __extension__ \
- ({ \
-- uint16x4_t c_ = (c); \
- uint16x4_t b_ = (b); \
-- uint32x4_t a_ = (a); \
++ __extension__ \
++ ({ \
++ uint16x4_t b_ = (b); \
+ uint16x4_t a_ = (a); \
- uint32x4_t result; \
-- __asm__ ("umlsl %0.4s, %2.4h, %3.h[%4]" \
++ uint32x4_t result; \
+ __asm__ ("umull %0.4s,%1.4h,%2.h[%3]" \
- : "=w"(result) \
-- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
++ : "=w"(result) \
+ : "w"(a_), "x"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlsl_lane_u32(a, b, c, d) \
++ : /* No clobbers */); \
++ result; \
++ })
++
+#define vmull_lane_u32(a, b, c) \
- __extension__ \
- ({ \
-- uint32x2_t c_ = (c); \
- uint32x2_t b_ = (b); \
-- uint64x2_t a_ = (a); \
++ __extension__ \
++ ({ \
++ uint32x2_t b_ = (b); \
+ uint32x2_t a_ = (a); \
- uint64x2_t result; \
-- __asm__ ("umlsl %0.2d, %2.2s, %3.s[%4]" \
++ uint64x2_t result; \
+ __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \
- : "=w"(result) \
-- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
++ : "=w"(result) \
+ : "w"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlsl_laneq_s16(a, b, c, d) \
++ : /* No clobbers */); \
++ result; \
++ })
++
+#define vmull_laneq_s16(a, b, c) \
- __extension__ \
- ({ \
-- int16x8_t c_ = (c); \
-- int16x4_t b_ = (b); \
-- int32x4_t a_ = (a); \
++ __extension__ \
++ ({ \
+ int16x8_t b_ = (b); \
+ int16x4_t a_ = (a); \
- int32x4_t result; \
-- __asm__ ("smlsl %0.4s, %2.4h, %3.h[%4]" \
++ int32x4_t result; \
+ __asm__ ("smull %0.4s, %1.4h, %2.h[%3]" \
- : "=w"(result) \
-- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
++ : "=w"(result) \
+ : "w"(a_), "x"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlsl_laneq_s32(a, b, c, d) \
++ : /* No clobbers */); \
++ result; \
++ })
++
+#define vmull_laneq_s32(a, b, c) \
- __extension__ \
- ({ \
-- int32x4_t c_ = (c); \
-- int32x2_t b_ = (b); \
-- int64x2_t a_ = (a); \
++ __extension__ \
++ ({ \
+ int32x4_t b_ = (b); \
+ int32x2_t a_ = (a); \
- int64x2_t result; \
-- __asm__ ("smlsl %0.2d, %2.2s, %3.s[%4]" \
++ int64x2_t result; \
+ __asm__ ("smull %0.2d, %1.2s, %2.s[%3]" \
- : "=w"(result) \
-- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
++ : "=w"(result) \
+ : "w"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlsl_laneq_u16(a, b, c, d) \
++ : /* No clobbers */); \
++ result; \
++ })
++
+#define vmull_laneq_u16(a, b, c) \
- __extension__ \
- ({ \
-- uint16x8_t c_ = (c); \
-- uint16x4_t b_ = (b); \
-- uint32x4_t a_ = (a); \
++ __extension__ \
++ ({ \
+ uint16x8_t b_ = (b); \
+ uint16x4_t a_ = (a); \
- uint32x4_t result; \
-- __asm__ ("umlsl %0.4s, %2.4h, %3.h[%4]" \
++ uint32x4_t result; \
+ __asm__ ("umull %0.4s, %1.4h, %2.h[%3]" \
- : "=w"(result) \
-- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
++ : "=w"(result) \
+ : "w"(a_), "x"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlsl_laneq_u32(a, b, c, d) \
++ : /* No clobbers */); \
++ result; \
++ })
++
+#define vmull_laneq_u32(a, b, c) \
- __extension__ \
- ({ \
-- uint32x4_t c_ = (c); \
-- uint32x2_t b_ = (b); \
-- uint64x2_t a_ = (a); \
++ __extension__ \
++ ({ \
+ uint32x4_t b_ = (b); \
+ uint32x2_t a_ = (a); \
- uint64x2_t result; \
-- __asm__ ("umlsl %0.2d, %2.2s, %3.s[%4]" \
++ uint64x2_t result; \
+ __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \
- : "=w"(result) \
-- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
++ : "=w"(result) \
+ : "w"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlsl_n_s16 (int32x4_t a, int16x4_t b, int16_t c)
++ : /* No clobbers */); \
++ result; \
++ })
++
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_n_s16 (int16x4_t a, int16_t b)
{
- int32x4_t result;
-- __asm__ ("smlsl %0.4s, %2.4h, %3.h[0]"
+- int16x8_t result;
+- __asm__ ("mls %0.8h, %2.8h, %3.h[0]"
++ int32x4_t result;
+ __asm__ ("smull %0.4s,%1.4h,%2.h[0]"
: "=w"(result)
- : "0"(a), "w"(b), "x"(c)
@@ -15455,14 +16987,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return result;
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmlsl_n_s32 (int64x2_t a, int32x2_t b, int32_t c)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vmlsq_n_s32 (int32x4_t a, int32x4_t b, int32_t c)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_n_s32 (int32x2_t a, int32_t b)
{
- int64x2_t result;
-- __asm__ ("smlsl %0.2d, %2.2s, %3.s[0]"
+- int32x4_t result;
+- __asm__ ("mls %0.4s, %2.4s, %3.s[0]"
++ int64x2_t result;
+ __asm__ ("smull %0.2d,%1.2s,%2.s[0]"
: "=w"(result)
- : "0"(a), "w"(b), "w"(c)
@@ -15471,14 +17004,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return result;
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlsl_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vmlsq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_n_u16 (uint16x4_t a, uint16_t b)
{
- uint32x4_t result;
-- __asm__ ("umlsl %0.4s, %2.4h, %3.h[0]"
+- uint16x8_t result;
+- __asm__ ("mls %0.8h, %2.8h, %3.h[0]"
++ uint32x4_t result;
+ __asm__ ("umull %0.4s,%1.4h,%2.h[0]"
: "=w"(result)
- : "0"(a), "w"(b), "x"(c)
@@ -15487,14 +17021,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return result;
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmlsl_n_u32 (uint64x2_t a, uint32x2_t b, uint32_t c)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vmlsq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_n_u32 (uint32x2_t a, uint32_t b)
{
- uint64x2_t result;
-- __asm__ ("umlsl %0.2d, %2.2s, %3.s[0]"
+- uint32x4_t result;
+- __asm__ ("mls %0.4s, %2.4s, %3.s[0]"
++ uint64x2_t result;
+ __asm__ ("umull %0.2d,%1.2s,%2.s[0]"
: "=w"(result)
- : "0"(a), "w"(b), "w"(c)
@@ -15503,26 +17038,31 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return result;
}
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmlsl_s8 (int16x8_t a, int8x8_t b, int8x8_t c)
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+-vmlsq_s8 (int8x16_t a, int8x16_t b, int8x16_t c)
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_p8 (poly8x8_t a, poly8x8_t b)
-+{
+ {
+- int8x16_t result;
+- __asm__ ("mls %0.16b,%2.16b,%3.16b"
+ poly16x8_t result;
+ __asm__ ("pmull %0.8h, %1.8b, %2.8b"
-+ : "=w"(result)
+ : "=w"(result)
+- : "0"(a), "w"(b), "w"(c)
+ : "w"(a), "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
+ : /* No clobbers */);
+ return result;
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vmlsq_s16 (int16x8_t a, int16x8_t b, int16x8_t c)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_s8 (int8x8_t a, int8x8_t b)
{
int16x8_t result;
-- __asm__ ("smlsl %0.8h, %2.8b, %3.8b"
+- __asm__ ("mls %0.8h,%2.8h,%3.8h"
+ __asm__ ("smull %0.8h, %1.8b, %2.8b"
: "=w"(result)
- : "0"(a), "w"(b), "w"(c)
@@ -15532,13 +17072,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlsl_s16 (int32x4_t a, int16x4_t b, int16x4_t c)
+-vmlsq_s32 (int32x4_t a, int32x4_t b, int32x4_t c)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_s16 (int16x4_t a, int16x4_t b)
{
int32x4_t result;
-- __asm__ ("smlsl %0.4s, %2.4h, %3.4h"
+- __asm__ ("mls %0.4s,%2.4s,%3.4s"
+ __asm__ ("smull %0.4s, %1.4h, %2.4h"
: "=w"(result)
- : "0"(a), "w"(b), "w"(c)
@@ -15547,14 +17087,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return result;
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmlsl_s32 (int64x2_t a, int32x2_t b, int32x2_t c)
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vmlsq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_s32 (int32x2_t a, int32x2_t b)
{
- int64x2_t result;
-- __asm__ ("smlsl %0.2d, %2.2s, %3.2s"
+- uint8x16_t result;
+- __asm__ ("mls %0.16b,%2.16b,%3.16b"
++ int64x2_t result;
+ __asm__ ("smull %0.2d, %1.2s, %2.2s"
: "=w"(result)
- : "0"(a), "w"(b), "w"(c)
@@ -15564,13 +17105,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmlsl_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c)
+-vmlsq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_u8 (uint8x8_t a, uint8x8_t b)
{
uint16x8_t result;
-- __asm__ ("umlsl %0.8h, %2.8b, %3.8b"
+- __asm__ ("mls %0.8h,%2.8h,%3.8h"
+ __asm__ ("umull %0.8h, %1.8b, %2.8b"
: "=w"(result)
- : "0"(a), "w"(b), "w"(c)
@@ -15580,13 +17121,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlsl_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c)
+-vmlsq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_u16 (uint16x4_t a, uint16x4_t b)
{
uint32x4_t result;
-- __asm__ ("umlsl %0.4s, %2.4h, %3.4h"
+- __asm__ ("mls %0.4s,%2.4s,%3.4s"
+ __asm__ ("umull %0.4s, %1.4h, %2.4h"
: "=w"(result)
- : "0"(a), "w"(b), "w"(c)
@@ -15595,323 +17136,333 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return result;
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vmovl_high_s8 (int8x16_t a)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_u32 (uint32x2_t a, uint32x2_t b)
{
- uint64x2_t result;
-- __asm__ ("umlsl %0.2d, %2.2s, %3.2s"
+- int16x8_t result;
+- __asm__ ("sshll2 %0.8h,%1.16b,#0"
++ uint64x2_t result;
+ __asm__ ("umull %0.2d, %1.2s, %2.2s"
: "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
+- : "w"(a)
+ : "w"(a), "w"(b)
: /* No clobbers */);
return result;
}
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vmlsq_n_f32 (float32x4_t a, float32x4_t b, float32_t c)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vmovl_high_s16 (int16x8_t a)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadal_s8 (int16x4_t a, int8x8_t b)
{
-- float32x4_t result;
-- float32x4_t t1;
-- __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fsub %0.4s, %0.4s, %1.4s"
-- : "=w"(result), "=w"(t1)
-- : "0"(a), "w"(b), "w"(c)
+- int32x4_t result;
+- __asm__ ("sshll2 %0.4s,%1.8h,#0"
+ int16x4_t result;
+ __asm__ ("sadalp %0.4h,%2.8b"
-+ : "=w"(result)
+ : "=w"(result)
+- : "w"(a)
+ : "0"(a), "w"(b)
: /* No clobbers */);
return result;
}
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmlsq_n_s16 (int16x8_t a, int16x8_t b, int16_t c)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vmovl_high_s32 (int32x4_t a)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadal_s16 (int32x2_t a, int16x4_t b)
{
-- int16x8_t result;
-- __asm__ ("mls %0.8h, %2.8h, %3.h[0]"
+- int64x2_t result;
+- __asm__ ("sshll2 %0.2d,%1.4s,#0"
+ int32x2_t result;
+ __asm__ ("sadalp %0.2s,%2.4h"
: "=w"(result)
-- : "0"(a), "w"(b), "x"(c)
+- : "w"(a)
+ : "0"(a), "w"(b)
: /* No clobbers */);
return result;
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlsq_n_s32 (int32x4_t a, int32x4_t b, int32_t c)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vmovl_high_u8 (uint8x16_t a)
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadal_s32 (int64x1_t a, int32x2_t b)
{
-- int32x4_t result;
-- __asm__ ("mls %0.4s, %2.4s, %3.s[0]"
+- uint16x8_t result;
+- __asm__ ("ushll2 %0.8h,%1.16b,#0"
+ int64x1_t result;
+ __asm__ ("sadalp %0.1d,%2.2s"
: "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
+- : "w"(a)
+ : "0"(a), "w"(b)
: /* No clobbers */);
return result;
}
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmlsq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vmovl_high_u16 (uint16x8_t a)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadal_u8 (uint16x4_t a, uint8x8_t b)
{
-- uint16x8_t result;
-- __asm__ ("mls %0.8h, %2.8h, %3.h[0]"
+- uint32x4_t result;
+- __asm__ ("ushll2 %0.4s,%1.8h,#0"
+ uint16x4_t result;
+ __asm__ ("uadalp %0.4h,%2.8b"
: "=w"(result)
-- : "0"(a), "w"(b), "x"(c)
+- : "w"(a)
+ : "0"(a), "w"(b)
: /* No clobbers */);
return result;
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlsq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vmovl_high_u32 (uint32x4_t a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadal_u16 (uint32x2_t a, uint16x4_t b)
{
-- uint32x4_t result;
-- __asm__ ("mls %0.4s, %2.4s, %3.s[0]"
+- uint64x2_t result;
+- __asm__ ("ushll2 %0.2d,%1.4s,#0"
+ uint32x2_t result;
+ __asm__ ("uadalp %0.2s,%2.4h"
: "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
+- : "w"(a)
+ : "0"(a), "w"(b)
: /* No clobbers */);
return result;
}
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vmlsq_s8 (int8x16_t a, int8x16_t b, int8x16_t c)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vmovl_s8 (int8x8_t a)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadal_u32 (uint64x1_t a, uint32x2_t b)
- {
-- int8x16_t result;
-- __asm__ ("mls %0.16b,%2.16b,%3.16b"
++{
+ uint64x1_t result;
+ __asm__ ("uadalp %0.1d,%2.2s"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
++ : "=w"(result)
+ : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmlsq_s16 (int16x8_t a, int16x8_t b, int16x8_t c)
++ : /* No clobbers */);
++ return result;
++}
++
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadalq_s8 (int16x8_t a, int8x16_t b)
{
int16x8_t result;
-- __asm__ ("mls %0.8h,%2.8h,%3.8h"
+- __asm__ ("sshll %0.8h,%1.8b,#0"
+ __asm__ ("sadalp %0.8h,%2.16b"
: "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
+- : "w"(a)
+ : "0"(a), "w"(b)
: /* No clobbers */);
return result;
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlsq_s32 (int32x4_t a, int32x4_t b, int32x4_t c)
+-vmovl_s16 (int16x4_t a)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadalq_s16 (int32x4_t a, int16x8_t b)
{
int32x4_t result;
-- __asm__ ("mls %0.4s,%2.4s,%3.4s"
+- __asm__ ("sshll %0.4s,%1.4h,#0"
+ __asm__ ("sadalp %0.4s,%2.8h"
: "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
+- : "w"(a)
+ : "0"(a), "w"(b)
: /* No clobbers */);
return result;
}
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vmlsq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vmovl_s32 (int32x2_t a)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadalq_s32 (int64x2_t a, int32x4_t b)
{
-- uint8x16_t result;
-- __asm__ ("mls %0.16b,%2.16b,%3.16b"
-+ int64x2_t result;
+ int64x2_t result;
+- __asm__ ("sshll %0.2d,%1.2s,#0"
+ __asm__ ("sadalp %0.2d,%2.4s"
: "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
+- : "w"(a)
+ : "0"(a), "w"(b)
: /* No clobbers */);
return result;
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmlsq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c)
+-vmovl_u8 (uint8x8_t a)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadalq_u8 (uint16x8_t a, uint8x16_t b)
{
uint16x8_t result;
-- __asm__ ("mls %0.8h,%2.8h,%3.8h"
+- __asm__ ("ushll %0.8h,%1.8b,#0"
+ __asm__ ("uadalp %0.8h,%2.16b"
: "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
+- : "w"(a)
+ : "0"(a), "w"(b)
: /* No clobbers */);
return result;
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlsq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
+-vmovl_u16 (uint16x4_t a)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadalq_u16 (uint32x4_t a, uint16x8_t b)
{
uint32x4_t result;
-- __asm__ ("mls %0.4s,%2.4s,%3.4s"
+- __asm__ ("ushll %0.4s,%1.4h,#0"
+ __asm__ ("uadalp %0.4s,%2.8h"
: "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
+- : "w"(a)
+ : "0"(a), "w"(b)
: /* No clobbers */);
return result;
}
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmovl_high_s8 (int8x16_t a)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vmovl_u32 (uint32x2_t a)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadalq_u32 (uint64x2_t a, uint32x4_t b)
{
-- int16x8_t result;
-- __asm__ ("sshll2 %0.8h,%1.16b,#0"
-+ uint64x2_t result;
+ uint64x2_t result;
+- __asm__ ("ushll %0.2d,%1.2s,#0"
+ __asm__ ("uadalp %0.2d,%2.4s"
-+ : "=w"(result)
+ : "=w"(result)
+- : "w"(a)
+ : "0"(a), "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
+ : /* No clobbers */);
+ return result;
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+-vmovn_high_s16 (int8x8_t a, int16x8_t b)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddl_s8 (int8x8_t a)
-+{
+ {
+- int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
+- __asm__ ("xtn2 %0.16b,%1.8h"
+- : "+w"(result)
+- : "w"(b)
+ int16x4_t result;
+ __asm__ ("saddlp %0.4h,%1.8b"
- : "=w"(result)
- : "w"(a)
++ : "=w"(result)
++ : "w"(a)
: /* No clobbers */);
return result;
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmovl_high_s16 (int16x8_t a)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vmovn_high_s32 (int16x4_t a, int32x4_t b)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddl_s16 (int16x4_t a)
{
-- int32x4_t result;
-- __asm__ ("sshll2 %0.4s,%1.8h,#0"
+- int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0)));
+- __asm__ ("xtn2 %0.8h,%1.4s"
+- : "+w"(result)
+- : "w"(b)
+ int32x2_t result;
+ __asm__ ("saddlp %0.2s,%1.4h"
- : "=w"(result)
- : "w"(a)
++ : "=w"(result)
++ : "w"(a)
: /* No clobbers */);
return result;
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmovl_high_s32 (int32x4_t a)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vmovn_high_s64 (int32x2_t a, int64x2_t b)
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddl_s32 (int32x2_t a)
{
-- int64x2_t result;
-- __asm__ ("sshll2 %0.2d,%1.4s,#0"
+- int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0)));
+- __asm__ ("xtn2 %0.4s,%1.2d"
+- : "+w"(result)
+- : "w"(b)
+ int64x1_t result;
+ __asm__ ("saddlp %0.1d,%1.2s"
- : "=w"(result)
- : "w"(a)
++ : "=w"(result)
++ : "w"(a)
: /* No clobbers */);
return result;
}
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmovl_high_u8 (uint8x16_t a)
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vmovn_high_u16 (uint8x8_t a, uint16x8_t b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddl_u8 (uint8x8_t a)
{
-- uint16x8_t result;
-- __asm__ ("ushll2 %0.8h,%1.16b,#0"
+- uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
+- __asm__ ("xtn2 %0.16b,%1.8h"
+- : "+w"(result)
+- : "w"(b)
+ uint16x4_t result;
+ __asm__ ("uaddlp %0.4h,%1.8b"
- : "=w"(result)
- : "w"(a)
++ : "=w"(result)
++ : "w"(a)
: /* No clobbers */);
return result;
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmovl_high_u16 (uint16x8_t a)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vmovn_high_u32 (uint16x4_t a, uint32x4_t b)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddl_u16 (uint16x4_t a)
{
-- uint32x4_t result;
-- __asm__ ("ushll2 %0.4s,%1.8h,#0"
+- uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
+- __asm__ ("xtn2 %0.8h,%1.4s"
+- : "+w"(result)
+- : "w"(b)
+ uint32x2_t result;
+ __asm__ ("uaddlp %0.2s,%1.4h"
- : "=w"(result)
- : "w"(a)
++ : "=w"(result)
++ : "w"(a)
: /* No clobbers */);
return result;
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmovl_high_u32 (uint32x4_t a)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vmovn_high_u64 (uint32x2_t a, uint64x2_t b)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddl_u32 (uint32x2_t a)
{
-- uint64x2_t result;
-- __asm__ ("ushll2 %0.2d,%1.4s,#0"
+- uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
+- __asm__ ("xtn2 %0.4s,%1.2d"
+- : "+w"(result)
+- : "w"(b)
+ uint64x1_t result;
+ __asm__ ("uaddlp %0.1d,%1.2s"
- : "=w"(result)
- : "w"(a)
++ : "=w"(result)
++ : "w"(a)
: /* No clobbers */);
return result;
}
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmovl_s8 (int8x8_t a)
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vmovn_s16 (int16x8_t a)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddlq_s8 (int8x16_t a)
{
- int16x8_t result;
-- __asm__ ("sshll %0.8h,%1.8b,#0"
+- int8x8_t result;
+- __asm__ ("xtn %0.8b,%1.8h"
++ int16x8_t result;
+ __asm__ ("saddlp %0.8h,%1.16b"
: "=w"(result)
: "w"(a)
@@ -15919,14 +17470,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return result;
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmovl_s16 (int16x4_t a)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vmovn_s32 (int32x4_t a)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddlq_s16 (int16x8_t a)
{
- int32x4_t result;
-- __asm__ ("sshll %0.4s,%1.4h,#0"
+- int16x4_t result;
+- __asm__ ("xtn %0.4h,%1.4s"
++ int32x4_t result;
+ __asm__ ("saddlp %0.4s,%1.8h"
: "=w"(result)
: "w"(a)
@@ -15934,14 +17486,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return result;
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmovl_s32 (int32x2_t a)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vmovn_s64 (int64x2_t a)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddlq_s32 (int32x4_t a)
{
- int64x2_t result;
-- __asm__ ("sshll %0.2d,%1.2s,#0"
+- int32x2_t result;
+- __asm__ ("xtn %0.2s,%1.2d"
++ int64x2_t result;
+ __asm__ ("saddlp %0.2d,%1.4s"
: "=w"(result)
: "w"(a)
@@ -15949,14 +17502,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return result;
}
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmovl_u8 (uint8x8_t a)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vmovn_u16 (uint16x8_t a)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddlq_u8 (uint8x16_t a)
{
- uint16x8_t result;
-- __asm__ ("ushll %0.8h,%1.8b,#0"
+- uint8x8_t result;
+- __asm__ ("xtn %0.8b,%1.8h"
++ uint16x8_t result;
+ __asm__ ("uaddlp %0.8h,%1.16b"
: "=w"(result)
: "w"(a)
@@ -15964,14 +17518,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return result;
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmovl_u16 (uint16x4_t a)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vmovn_u32 (uint32x4_t a)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddlq_u16 (uint16x8_t a)
{
- uint32x4_t result;
-- __asm__ ("ushll %0.4s,%1.4h,#0"
+- uint16x4_t result;
+- __asm__ ("xtn %0.4h,%1.4s"
++ uint32x4_t result;
+ __asm__ ("uaddlp %0.4s,%1.8h"
: "=w"(result)
: "w"(a)
@@ -15979,14 +17534,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return result;
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmovl_u32 (uint32x2_t a)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vmovn_u64 (uint64x2_t a)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddlq_u32 (uint32x4_t a)
{
- uint64x2_t result;
-- __asm__ ("ushll %0.2d,%1.2s,#0"
+- uint32x2_t result;
+- __asm__ ("xtn %0.2s,%1.2d"
++ uint64x2_t result;
+ __asm__ ("uaddlp %0.2d,%1.4s"
: "=w"(result)
: "w"(a)
@@ -15994,302 +17550,164 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return result;
}
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vmovn_high_s16 (int8x8_t a, int16x8_t b)
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vmul_n_f32 (float32x2_t a, float32_t b)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddq_s8 (int8x16_t a, int8x16_t b)
{
-- int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("xtn2 %0.16b,%1.8h"
-- : "+w"(result)
-- : "w"(b)
+- float32x2_t result;
+- __asm__ ("fmul %0.2s,%1.2s,%2.s[0]"
+ int8x16_t result;
+ __asm__ ("addp %0.16b,%1.16b,%2.16b"
+ : "=w"(result)
+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmovn_high_s32 (int16x4_t a, int32x4_t b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++ : /* No clobbers */);
++ return result;
++}
++
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddq_s16 (int16x8_t a, int16x8_t b)
- {
-- int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("xtn2 %0.8h,%1.4s"
-- : "+w"(result)
-- : "w"(b)
++{
+ int16x8_t result;
+ __asm__ ("addp %0.8h,%1.8h,%2.8h"
+ : "=w"(result)
+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmovn_high_s64 (int32x2_t a, int64x2_t b)
++ : /* No clobbers */);
++ return result;
++}
++
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddq_s32 (int32x4_t a, int32x4_t b)
- {
-- int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("xtn2 %0.4s,%1.2d"
-- : "+w"(result)
-- : "w"(b)
++{
+ int32x4_t result;
+ __asm__ ("addp %0.4s,%1.4s,%2.4s"
+ : "=w"(result)
+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vmovn_high_u16 (uint8x8_t a, uint16x8_t b)
++ : /* No clobbers */);
++ return result;
++}
++
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddq_s64 (int64x2_t a, int64x2_t b)
- {
-- uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("xtn2 %0.16b,%1.8h"
-- : "+w"(result)
-- : "w"(b)
++{
+ int64x2_t result;
+ __asm__ ("addp %0.2d,%1.2d,%2.2d"
+ : "=w"(result)
+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmovn_high_u32 (uint16x4_t a, uint32x4_t b)
++ : /* No clobbers */);
++ return result;
++}
++
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddq_u8 (uint8x16_t a, uint8x16_t b)
- {
-- uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("xtn2 %0.8h,%1.4s"
-- : "+w"(result)
-- : "w"(b)
++{
+ uint8x16_t result;
+ __asm__ ("addp %0.16b,%1.16b,%2.16b"
+ : "=w"(result)
+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmovn_high_u64 (uint32x2_t a, uint64x2_t b)
++ : /* No clobbers */);
++ return result;
++}
++
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddq_u16 (uint16x8_t a, uint16x8_t b)
- {
-- uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("xtn2 %0.4s,%1.2d"
-- : "+w"(result)
-- : "w"(b)
++{
+ uint16x8_t result;
+ __asm__ ("addp %0.8h,%1.8h,%2.8h"
+ : "=w"(result)
+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vmovn_s16 (int16x8_t a)
++ : /* No clobbers */);
++ return result;
++}
++
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddq_u32 (uint32x4_t a, uint32x4_t b)
- {
-- int8x8_t result;
-- __asm__ ("xtn %0.8b,%1.8h"
++{
+ uint32x4_t result;
+ __asm__ ("addp %0.4s,%1.4s,%2.4s"
- : "=w"(result)
-- : "w"(a)
++ : "=w"(result)
+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vmovn_s32 (int32x4_t a)
++ : /* No clobbers */);
++ return result;
++}
++
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpaddq_u64 (uint64x2_t a, uint64x2_t b)
- {
-- int16x4_t result;
-- __asm__ ("xtn %0.4h,%1.4s"
++{
+ uint64x2_t result;
+ __asm__ ("addp %0.2d,%1.2d,%2.2d"
: "=w"(result)
-- : "w"(a)
-+ : "w"(a), "w"(b)
+ : "w"(a), "w"(b)
: /* No clobbers */);
return result;
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vmovn_s64 (int64x2_t a)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vmul_n_s16 (int16x4_t a, int16_t b)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmulh_n_s16 (int16x4_t a, int16_t b)
{
-- int32x2_t result;
-- __asm__ ("xtn %0.2s,%1.2d"
-+ int16x4_t result;
+ int16x4_t result;
+- __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
+ __asm__ ("sqdmulh %0.4h,%1.4h,%2.h[0]"
: "=w"(result)
-- : "w"(a)
-+ : "w"(a), "x"(b)
+ : "w"(a), "x"(b)
: /* No clobbers */);
return result;
}
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vmovn_u16 (uint16x8_t a)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vmul_n_s32 (int32x2_t a, int32_t b)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmulh_n_s32 (int32x2_t a, int32_t b)
{
-- uint8x8_t result;
-- __asm__ ("xtn %0.8b,%1.8h"
-+ int32x2_t result;
+ int32x2_t result;
+- __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
+ __asm__ ("sqdmulh %0.2s,%1.2s,%2.s[0]"
: "=w"(result)
-- : "w"(a)
-+ : "w"(a), "w"(b)
+ : "w"(a), "w"(b)
: /* No clobbers */);
return result;
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vmovn_u32 (uint32x4_t a)
+-vmul_n_u16 (uint16x4_t a, uint16_t b)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmulhq_n_s16 (int16x8_t a, int16_t b)
{
- uint16x4_t result;
-- __asm__ ("xtn %0.4h,%1.4s"
+- __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
+ int16x8_t result;
+ __asm__ ("sqdmulh %0.8h,%1.8h,%2.h[0]"
: "=w"(result)
-- : "w"(a)
-+ : "w"(a), "x"(b)
+ : "w"(a), "x"(b)
: /* No clobbers */);
return result;
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vmovn_u64 (uint64x2_t a)
+-vmul_n_u32 (uint32x2_t a, uint32_t b)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqdmulhq_n_s32 (int32x4_t a, int32_t b)
{
- uint32x2_t result;
-- __asm__ ("xtn %0.2s,%1.2d"
+- __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
+ int32x4_t result;
+ __asm__ ("sqdmulh %0.4s,%1.4s,%2.s[0]"
: "=w"(result)
-- : "w"(a)
-+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vmul_n_f32 (float32x2_t a, float32_t b)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_high_s16 (int8x8_t a, int16x8_t b)
- {
-- float32x2_t result;
-- __asm__ ("fmul %0.2s,%1.2s,%2.s[0]"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-+ int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("sqxtn2 %0.16b, %1.8h"
-+ : "+w"(result)
-+ : "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vmul_n_s16 (int16x4_t a, int16_t b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_high_s32 (int16x4_t a, int32x4_t b)
- {
-- int16x4_t result;
-- __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-- : "=w"(result)
-- : "w"(a), "x"(b)
-+ int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("sqxtn2 %0.8h, %1.4s"
-+ : "+w"(result)
-+ : "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vmul_n_s32 (int32x2_t a, int32_t b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_high_s64 (int32x2_t a, int64x2_t b)
- {
-- int32x2_t result;
-- __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-+ int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("sqxtn2 %0.4s, %1.2d"
-+ : "+w"(result)
-+ : "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vmul_n_u16 (uint16x4_t a, uint16_t b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_high_u16 (uint8x8_t a, uint16x8_t b)
- {
-- uint16x4_t result;
-- __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-- : "=w"(result)
-- : "w"(a), "x"(b)
-+ uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("uqxtn2 %0.16b, %1.8h"
-+ : "+w"(result)
-+ : "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vmul_n_u32 (uint32x2_t a, uint32_t b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_high_u32 (uint16x4_t a, uint32x4_t b)
- {
-- uint32x2_t result;
-- __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-+ uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("uqxtn2 %0.8h, %1.4s"
-+ : "+w"(result)
-+ : "w"(b)
+ : "w"(a), "w"(b)
: /* No clobbers */);
return result;
}
@@ -16306,12 +17724,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_high_u64 (uint32x2_t a, uint64x2_t b)
++vqmovn_high_s16 (int8x8_t a, int16x8_t b)
+{
-+ uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("uqxtn2 %0.4s, %1.2d"
++ int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
++ __asm__ ("sqxtn2 %0.16b, %1.8h"
+ : "+w"(result)
+ : "w"(b)
+ : /* No clobbers */);
@@ -16408,19 +17826,30 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
--
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vqmovn_high_s32 (int16x4_t a, int32x4_t b)
++{
++ int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0)));
++ __asm__ ("sqxtn2 %0.8h, %1.4s"
++ : "+w"(result)
++ : "w"(b)
++ : /* No clobbers */);
++ return result;
++}
+
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_high_n_s16 (int16x8_t a, int16_t b)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovun_high_s16 (uint8x8_t a, int16x8_t b)
++vqmovn_high_s64 (int32x2_t a, int64x2_t b)
{
- int32x4_t result;
- __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]"
- : "=w"(result)
- : "w"(a), "x"(b)
-+ uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("sqxtun2 %0.16b, %1.8h"
++ int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0)));
++ __asm__ ("sqxtn2 %0.4s, %1.2d"
+ : "+w"(result)
+ : "w"(b)
: /* No clobbers */);
@@ -16429,16 +17858,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_high_n_s32 (int32x4_t a, int32_t b)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovun_high_s32 (uint16x4_t a, int32x4_t b)
++vqmovn_high_u16 (uint8x8_t a, uint16x8_t b)
{
- int64x2_t result;
- __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]"
- : "=w"(result)
- : "w"(a), "w"(b)
-+ uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("sqxtun2 %0.8h, %1.4s"
++ uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
++ __asm__ ("uqxtn2 %0.16b, %1.8h"
+ : "+w"(result)
+ : "w"(b)
: /* No clobbers */);
@@ -16447,16 +17876,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_high_n_u16 (uint16x8_t a, uint16_t b)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovun_high_s64 (uint32x2_t a, int64x2_t b)
++vqmovn_high_u32 (uint16x4_t a, uint32x4_t b)
{
- uint32x4_t result;
- __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]"
- : "=w"(result)
- : "w"(a), "x"(b)
-+ uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("sqxtun2 %0.4s, %1.2d"
++ uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
++ __asm__ ("uqxtn2 %0.8h, %1.4s"
+ : "+w"(result)
+ : "w"(b)
: /* No clobbers */);
@@ -16465,114 +17894,147 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_high_n_u32 (uint32x4_t a, uint32_t b)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulh_n_s16 (int16x4_t a, int16_t b)
++vqmovn_high_u64 (uint32x2_t a, uint64x2_t b)
{
- uint64x2_t result;
- __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]"
-+ int16x4_t result;
-+ __asm__ ("sqrdmulh %0.4h,%1.4h,%2.h[0]"
- : "=w"(result)
+- : "=w"(result)
- : "w"(a), "w"(b)
-+ : "w"(a), "x"(b)
++ uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
++ __asm__ ("uqxtn2 %0.4s, %1.2d"
++ : "+w"(result)
++ : "w"(b)
: /* No clobbers */);
return result;
}
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vmull_high_p8 (poly8x16_t a, poly8x16_t b)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulh_n_s32 (int32x2_t a, int32_t b)
++vqmovun_high_s16 (uint8x8_t a, int16x8_t b)
{
- poly16x8_t result;
- __asm__ ("pmull2 %0.8h,%1.16b,%2.16b"
-+ int32x2_t result;
-+ __asm__ ("sqrdmulh %0.2s,%1.2s,%2.s[0]"
- : "=w"(result)
- : "w"(a), "w"(b)
+- : "=w"(result)
+- : "w"(a), "w"(b)
++ uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
++ __asm__ ("sqxtun2 %0.16b, %1.8h"
++ : "+w"(result)
++ : "w"(b)
: /* No clobbers */);
return result;
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmull_high_s8 (int8x16_t a, int8x16_t b)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhq_n_s16 (int16x8_t a, int16_t b)
++vqmovun_high_s32 (uint16x4_t a, int32x4_t b)
{
- int16x8_t result;
+- int16x8_t result;
- __asm__ ("smull2 %0.8h,%1.16b,%2.16b"
-+ __asm__ ("sqrdmulh %0.8h,%1.8h,%2.h[0]"
- : "=w"(result)
+- : "=w"(result)
- : "w"(a), "w"(b)
-+ : "w"(a), "x"(b)
++ uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
++ __asm__ ("sqxtun2 %0.8h, %1.4s"
++ : "+w"(result)
++ : "w"(b)
: /* No clobbers */);
return result;
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_high_s16 (int16x8_t a, int16x8_t b)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhq_n_s32 (int32x4_t a, int32_t b)
++vqmovun_high_s64 (uint32x2_t a, int64x2_t b)
{
- int32x4_t result;
+- int32x4_t result;
- __asm__ ("smull2 %0.4s,%1.8h,%2.8h"
- : "=w"(result)
- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
--
++ uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
++ __asm__ ("sqxtun2 %0.4s, %1.2d"
++ : "+w"(result)
++ : "w"(b)
+ : /* No clobbers */);
+ return result;
+ }
+
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_high_s32 (int32x4_t a, int32x4_t b)
--{
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vqrdmulh_n_s16 (int16x4_t a, int16_t b)
+ {
- int64x2_t result;
- __asm__ ("smull2 %0.2d,%1.4s,%2.4s"
-- : "=w"(result)
++ int16x4_t result;
++ __asm__ ("sqrdmulh %0.4h,%1.4h,%2.h[0]"
+ : "=w"(result)
- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
--
++ : "w"(a), "x"(b)
+ : /* No clobbers */);
+ return result;
+ }
+
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmull_high_u8 (uint8x16_t a, uint8x16_t b)
--{
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vqrdmulh_n_s32 (int32x2_t a, int32_t b)
+ {
- uint16x8_t result;
- __asm__ ("umull2 %0.8h,%1.16b,%2.16b"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
--
++ int32x2_t result;
++ __asm__ ("sqrdmulh %0.2s,%1.2s,%2.s[0]"
+ : "=w"(result)
+ : "w"(a), "w"(b)
+ : /* No clobbers */);
+ return result;
+ }
+
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_high_u16 (uint16x8_t a, uint16x8_t b)
--{
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vqrdmulhq_n_s16 (int16x8_t a, int16_t b)
+ {
- uint32x4_t result;
- __asm__ ("umull2 %0.4s,%1.8h,%2.8h"
-+ __asm__ ("sqrdmulh %0.4s,%1.4s,%2.s[0]"
++ int16x8_t result;
++ __asm__ ("sqrdmulh %0.8h,%1.8h,%2.h[0]"
: "=w"(result)
- : "w"(a), "w"(b)
+- : "w"(a), "w"(b)
++ : "w"(a), "x"(b)
: /* No clobbers */);
return result;
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_high_u32 (uint32x4_t a, uint32x4_t b)
--{
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vqrdmulhq_n_s32 (int32x4_t a, int32_t b)
+ {
- uint64x2_t result;
- __asm__ ("umull2 %0.2d,%1.4s,%2.4s"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
++ int32x4_t result;
++ __asm__ ("sqrdmulh %0.4s,%1.4s,%2.s[0]"
+ : "=w"(result)
+ : "w"(a), "w"(b)
+ : /* No clobbers */);
+ return result;
+ }
+
+-#define vmull_lane_s16(a, b, c) \
+#define vqrshrn_high_n_s16(a, b, c) \
-+ __extension__ \
-+ ({ \
+ __extension__ \
+ ({ \
+- int16x4_t b_ = (b); \
+ int16x8_t b_ = (b); \
+ int8x8_t a_ = (a); \
+ int8x16_t result = vcombine_s8 \
@@ -16584,12 +18046,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ : /* No clobbers */); \
+ result; \
+ })
-
--#define vmull_lane_s16(a, b, c) \
++
+#define vqrshrn_high_n_s32(a, b, c) \
- __extension__ \
- ({ \
-- int16x4_t b_ = (b); \
++ __extension__ \
++ ({ \
+ int32x4_t b_ = (b); \
int16x4_t a_ = (a); \
- int32x4_t result; \
@@ -16654,11 +18114,6 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
__extension__ \
({ \
- uint32x2_t b_ = (b); \
-- uint32x2_t a_ = (a); \
-- uint64x2_t result; \
-- __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \
-- : "=w"(result) \
-- : "w"(a_), "w"(b_), "i"(c) \
+ uint32x4_t b_ = (b); \
+ uint16x4_t a_ = (a); \
+ uint16x8_t result = vcombine_u16 \
@@ -16667,22 +18122,19 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __asm__ ("uqrshrn2 %0.8h, %1.4s, #%2" \
+ : "+w"(result) \
+ : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmull_laneq_s16(a, b, c) \
++ : /* No clobbers */); \
++ result; \
++ })
++
+#define vqrshrn_high_n_u64(a, b, c) \
- __extension__ \
- ({ \
-- int16x8_t b_ = (b); \
-- int16x4_t a_ = (a); \
-- int32x4_t result; \
-- __asm__ ("smull %0.4s, %1.4h, %2.h[%3]" \
-- : "=w"(result) \
-- : "w"(a_), "x"(b_), "i"(c) \
++ __extension__ \
++ ({ \
+ uint64x2_t b_ = (b); \
-+ uint32x2_t a_ = (a); \
+ uint32x2_t a_ = (a); \
+- uint64x2_t result; \
+- __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \
+- : "=w"(result) \
+- : "w"(a_), "w"(b_), "i"(c) \
+ uint32x4_t result = vcombine_u32 \
+ (a_, vcreate_u32 \
+ (__AARCH64_UINT64_C (0x0))); \
@@ -16693,17 +18145,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
result; \
})
--#define vmull_laneq_s32(a, b, c) \
+-#define vmull_laneq_s16(a, b, c) \
+#define vqrshrun_high_n_s16(a, b, c) \
__extension__ \
({ \
-- int32x4_t b_ = (b); \
-- int32x2_t a_ = (a); \
-- int64x2_t result; \
-- __asm__ ("smull %0.2d, %1.2s, %2.s[%3]" \
+ int16x8_t b_ = (b); \
+- int16x4_t a_ = (a); \
+- int32x4_t result; \
+- __asm__ ("smull %0.4s, %1.4h, %2.h[%3]" \
- : "=w"(result) \
-- : "w"(a_), "w"(b_), "i"(c) \
-+ int16x8_t b_ = (b); \
+- : "w"(a_), "x"(b_), "i"(c) \
+ uint8x8_t a_ = (a); \
+ uint8x16_t result = vcombine_u8 \
+ (a_, vcreate_u8 \
@@ -16715,69 +18166,37 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
result; \
})
--#define vmull_laneq_u16(a, b, c) \
+-#define vmull_laneq_s32(a, b, c) \
+#define vqrshrun_high_n_s32(a, b, c) \
- __extension__ \
- ({ \
-- uint16x8_t b_ = (b); \
++ __extension__ \
++ ({ \
+ int32x4_t b_ = (b); \
- uint16x4_t a_ = (a); \
-- uint32x4_t result; \
-- __asm__ ("umull %0.4s, %1.4h, %2.h[%3]" \
-- : "=w"(result) \
-- : "w"(a_), "x"(b_), "i"(c) \
++ uint16x4_t a_ = (a); \
+ uint16x8_t result = vcombine_u16 \
+ (a_, vcreate_u16 \
+ (__AARCH64_UINT64_C (0x0))); \
+ __asm__ ("sqrshrun2 %0.8h, %1.4s, #%2" \
+ : "+w"(result) \
+ : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmull_laneq_u32(a, b, c) \
++ : /* No clobbers */); \
++ result; \
++ })
++
+#define vqrshrun_high_n_s64(a, b, c) \
- __extension__ \
- ({ \
-- uint32x4_t b_ = (b); \
++ __extension__ \
++ ({ \
+ int64x2_t b_ = (b); \
- uint32x2_t a_ = (a); \
-- uint64x2_t result; \
-- __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \
-- : "=w"(result) \
-- : "w"(a_), "w"(b_), "i"(c) \
++ uint32x2_t a_ = (a); \
+ uint32x4_t result = vcombine_u32 \
+ (a_, vcreate_u32 \
+ (__AARCH64_UINT64_C (0x0))); \
+ __asm__ ("sqrshrun2 %0.4s, %1.2d, #%2" \
+ : "+w"(result) \
+ : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmull_n_s16 (int16x4_t a, int16_t b)
--{
-- int32x4_t result;
-- __asm__ ("smull %0.4s,%1.4h,%2.h[0]"
-- : "=w"(result)
-- : "w"(a), "x"(b)
-- : /* No clobbers */);
-- return result;
--}
--
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmull_n_s32 (int32x2_t a, int32_t b)
--{
-- int64x2_t result;
-- __asm__ ("smull %0.2d,%1.2s,%2.s[0]"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
++ : /* No clobbers */); \
++ result; \
++ })
++
+#define vqshrn_high_n_s16(a, b, c) \
+ __extension__ \
+ ({ \
@@ -16792,21 +18211,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ : /* No clobbers */); \
+ result; \
+ })
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmull_n_u16 (uint16x4_t a, uint16_t b)
--{
-- uint32x4_t result;
-- __asm__ ("umull %0.4s,%1.4h,%2.h[0]"
-- : "=w"(result)
-- : "w"(a), "x"(b)
-- : /* No clobbers */);
-- return result;
--}
++
+#define vqshrn_high_n_s32(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ int32x4_t b_ = (b); \
+ __extension__ \
+ ({ \
+ int32x4_t b_ = (b); \
+ int16x4_t a_ = (a); \
+ int16x8_t result = vcombine_s16 \
+ (a_, vcreate_s16 \
@@ -16817,46 +18226,36 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ : /* No clobbers */); \
+ result; \
+ })
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmull_n_u32 (uint32x2_t a, uint32_t b)
--{
-- uint64x2_t result;
-- __asm__ ("umull %0.2d,%1.2s,%2.s[0]"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
++
+#define vqshrn_high_n_s64(a, b, c) \
+ __extension__ \
+ ({ \
+ int64x2_t b_ = (b); \
-+ int32x2_t a_ = (a); \
+ int32x2_t a_ = (a); \
+- int64x2_t result; \
+- __asm__ ("smull %0.2d, %1.2s, %2.s[%3]" \
+- : "=w"(result) \
+- : "w"(a_), "w"(b_), "i"(c) \
+ int32x4_t result = vcombine_s32 \
+ (a_, vcreate_s32 \
+ (__AARCH64_UINT64_C (0x0))); \
+ __asm__ ("sqshrn2 %0.4s, %1.2d, #%2" \
+ : "+w"(result) \
+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
+ : /* No clobbers */); \
+ result; \
+ })
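
(For context: the vq{r,}shrn_high_n_* macros above all follow one pattern — vcombine the caller's low half with a zeroed upper half, then let the "2" form of the narrowing shift (sqshrn2, uqrshrn2, ...) fill the upper half in place, which is why the asm ties the result with "+w". A usage sketch, illustrative only and not part of the patch:

#include <arm_neon.h>

/* Narrow two int32x4_t vectors into one saturated int16x8_t.  */
int16x8_t
pack_pair (int32x4_t lo, int32x4_t hi)
{
  int16x4_t low = vqshrn_n_s32 (lo, 8);    /* fills lanes 0-3 */
  return vqshrn_high_n_s32 (low, hi, 8);   /* fills lanes 4-7 */
}

End of sketch.)
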
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vmull_p8 (poly8x8_t a, poly8x8_t b)
--{
-- poly16x8_t result;
-- __asm__ ("pmull %0.8h, %1.8b, %2.8b"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
+-#define vmull_laneq_u16(a, b, c) \
+#define vqshrn_high_n_u16(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ uint16x8_t b_ = (b); \
+ __extension__ \
+ ({ \
+ uint16x8_t b_ = (b); \
+- uint16x4_t a_ = (a); \
+- uint32x4_t result; \
+- __asm__ ("umull %0.4s, %1.4h, %2.h[%3]" \
+- : "=w"(result) \
+- : "w"(a_), "x"(b_), "i"(c) \
+ uint8x8_t a_ = (a); \
+ uint8x16_t result = vcombine_u8 \
+ (a_, vcreate_u8 \
@@ -16864,24 +18263,20 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __asm__ ("uqshrn2 %0.16b, %1.8h, #%2" \
+ : "+w"(result) \
+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
+ : /* No clobbers */); \
+ result; \
+ })
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmull_s8 (int8x8_t a, int8x8_t b)
--{
-- int16x8_t result;
-- __asm__ ("smull %0.8h, %1.8b, %2.8b"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
+-#define vmull_laneq_u32(a, b, c) \
+#define vqshrn_high_n_u32(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ uint32x4_t b_ = (b); \
+ __extension__ \
+ ({ \
+ uint32x4_t b_ = (b); \
+- uint32x2_t a_ = (a); \
+- uint64x2_t result; \
+- __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \
+- : "=w"(result) \
+- : "w"(a_), "w"(b_), "i"(c) \
+ uint16x4_t a_ = (a); \
+ uint16x8_t result = vcombine_u16 \
+ (a_, vcreate_u16 \
@@ -16889,17 +18284,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __asm__ ("uqshrn2 %0.8h, %1.4s, #%2" \
+ : "+w"(result) \
+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
+ : /* No clobbers */); \
+ result; \
+ })
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmull_s16 (int16x4_t a, int16x4_t b)
+-vmull_n_s16 (int16x4_t a, int16_t b)
-{
- int32x4_t result;
-- __asm__ ("smull %0.4s, %1.4h, %2.4h"
+- __asm__ ("smull %0.4s,%1.4h,%2.h[0]"
- : "=w"(result)
-- : "w"(a), "w"(b)
+- : "w"(a), "x"(b)
- : /* No clobbers */);
- return result;
-}
@@ -16919,10 +18314,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ })
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmull_s32 (int32x2_t a, int32x2_t b)
+-vmull_n_s32 (int32x2_t a, int32_t b)
-{
- int64x2_t result;
-- __asm__ ("smull %0.2d, %1.2s, %2.2s"
+- __asm__ ("smull %0.2d,%1.2s,%2.s[0]"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
@@ -16943,13 +18338,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ result; \
+ })
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmull_u8 (uint8x8_t a, uint8x8_t b)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vmull_n_u16 (uint16x4_t a, uint16_t b)
-{
-- uint16x8_t result;
-- __asm__ ("umull %0.8h, %1.8b, %2.8b"
+- uint32x4_t result;
+- __asm__ ("umull %0.4s,%1.4h,%2.h[0]"
- : "=w"(result)
-- : "w"(a), "w"(b)
+- : "w"(a), "x"(b)
- : /* No clobbers */);
- return result;
-}
@@ -16968,11 +18363,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ result; \
+ })
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmull_u16 (uint16x4_t a, uint16x4_t b)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vmull_n_u32 (uint32x2_t a, uint32_t b)
-{
-- uint32x4_t result;
-- __asm__ ("umull %0.4s, %1.4h, %2.4h"
+- uint64x2_t result;
+- __asm__ ("umull %0.2d,%1.2s,%2.s[0]"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
@@ -16993,11 +18388,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ result; \
+ })
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmull_u32 (uint32x2_t a, uint32x2_t b)
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+-vmull_p8 (poly8x8_t a, poly8x8_t b)
-{
-- uint64x2_t result;
-- __asm__ ("umull %0.2d, %1.2s, %2.2s"
+- poly16x8_t result;
+- __asm__ ("pmull %0.8h, %1.8b, %2.8b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
@@ -17018,11 +18413,44 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ result; \
+ })
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vmulq_n_f32 (float32x4_t a, float32_t b)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vmull_s8 (int8x8_t a, int8x8_t b)
-{
-- float32x4_t result;
-- __asm__ ("fmul %0.4s,%1.4s,%2.s[0]"
+- int16x8_t result;
+- __asm__ ("smull %0.8h, %1.8b, %2.8b"
+- : "=w"(result)
+- : "w"(a), "w"(b)
+- : /* No clobbers */);
+- return result;
+-}
+-
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vmull_s16 (int16x4_t a, int16x4_t b)
+-{
+- int32x4_t result;
+- __asm__ ("smull %0.4s, %1.4h, %2.4h"
+- : "=w"(result)
+- : "w"(a), "w"(b)
+- : /* No clobbers */);
+- return result;
+-}
+-
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vmull_s32 (int32x2_t a, int32x2_t b)
+-{
+- int64x2_t result;
+- __asm__ ("smull %0.2d, %1.2s, %2.2s"
+- : "=w"(result)
+- : "w"(a), "w"(b)
+- : /* No clobbers */);
+- return result;
+-}
+-
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vmull_u8 (uint8x8_t a, uint8x8_t b)
+-{
+- uint16x8_t result;
+- __asm__ ("umull %0.8h, %1.8b, %2.8b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
@@ -17043,11 +18471,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ result; \
+ })
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vmulq_n_f64 (float64x2_t a, float64_t b)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vmull_u16 (uint16x4_t a, uint16x4_t b)
-{
-- float64x2_t result;
-- __asm__ ("fmul %0.2d,%1.2d,%2.d[0]"
+- uint32x4_t result;
+- __asm__ ("umull %0.4s, %1.4h, %2.4h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
@@ -17068,13 +18496,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ result; \
+ })
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmulq_n_s16 (int16x8_t a, int16_t b)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vmull_u32 (uint32x2_t a, uint32x2_t b)
-{
-- int16x8_t result;
-- __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
+- uint64x2_t result;
+- __asm__ ("umull %0.2d, %1.2s, %2.2s"
- : "=w"(result)
-- : "w"(a), "x"(b)
+- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
@@ -17093,11 +18521,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ result; \
+ })
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmulq_n_s32 (int32x4_t a, int32_t b)
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vmulq_n_f32 (float32x4_t a, float32_t b)
-{
-- int32x4_t result;
-- __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
+- float32x4_t result;
+- __asm__ ("fmul %0.4s,%1.4s,%2.s[0]"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
@@ -17118,13 +18546,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ result; \
+ })
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmulq_n_u16 (uint16x8_t a, uint16_t b)
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vmulq_n_f64 (float64x2_t a, float64_t b)
-{
-- uint16x8_t result;
-- __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
+- float64x2_t result;
+- __asm__ ("fmul %0.2d,%1.2d,%2.d[0]"
- : "=w"(result)
-- : "w"(a), "x"(b)
+- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
@@ -17143,13 +18571,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ result; \
+ })
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmulq_n_u32 (uint32x4_t a, uint32_t b)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vmulq_n_s16 (int16x8_t a, int16_t b)
-{
-- uint32x4_t result;
-- __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
+- int16x8_t result;
+- __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
- : "=w"(result)
-- : "w"(a), "w"(b)
+- : "w"(a), "x"(b)
- : /* No clobbers */);
- return result;
-}
@@ -17165,13 +18593,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ result; \
+ })
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vmvn_p8 (poly8x8_t a)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vmulq_n_s32 (int32x4_t a, int32_t b)
-{
-- poly8x8_t result;
-- __asm__ ("mvn %0.8b,%1.8b"
+- int32x4_t result;
+- __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
- : "=w"(result)
-- : "w"(a)
+- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
@@ -17187,13 +18615,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ result; \
+ })
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vmvn_s8 (int8x8_t a)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vmulq_n_u16 (uint16x8_t a, uint16_t b)
-{
-- int8x8_t result;
-- __asm__ ("mvn %0.8b,%1.8b"
+- uint16x8_t result;
+- __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
- : "=w"(result)
-- : "w"(a)
+- : "w"(a), "x"(b)
- : /* No clobbers */);
- return result;
-}
@@ -17209,13 +18637,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ result; \
+ })
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vmvn_s16 (int16x4_t a)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vmulq_n_u32 (uint32x4_t a, uint32_t b)
-{
-- int16x4_t result;
-- __asm__ ("mvn %0.8b,%1.8b"
+- uint32x4_t result;
+- __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
- : "=w"(result)
-- : "w"(a)
+- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
@@ -17231,10 +18659,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ result; \
+ })
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vmvn_s32 (int32x2_t a)
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+-vmvn_p8 (poly8x8_t a)
-{
-- int32x2_t result;
+- poly8x8_t result;
- __asm__ ("mvn %0.8b,%1.8b"
- : "=w"(result)
- : "w"(a)
@@ -17252,7 +18680,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ : /* No clobbers */); \
+ result; \
+ })
-+
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vmvn_s8 (int8x8_t a)
+-{
+- int8x8_t result;
+- __asm__ ("mvn %0.8b,%1.8b"
+- : "=w"(result)
+- : "w"(a)
+- : /* No clobbers */);
+- return result;
+-}
+#define vrshrn_n_u64(a, b) \
+ __extension__ \
+ ({ \
@@ -17265,13 +18703,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ result; \
+ })
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vmvn_u8 (uint8x8_t a)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vmvn_s16 (int16x4_t a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrte_u32 (uint32x2_t a)
{
-- uint8x8_t result;
+- int16x4_t result;
- __asm__ ("mvn %0.8b,%1.8b"
+ uint32x2_t result;
+ __asm__ ("ursqrte %0.2s,%1.2s"
@@ -17281,13 +18719,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return result;
}
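
(For context: vrsqrte_u32/vrsqrteq_u32 wrap URSQRTE, the unsigned reciprocal-square-root estimate; only the declaration style changes here (extern __inline with __gnu_inline__/__artificial__). A one-line usage sketch, illustrative only and not part of the patch:

#include <arm_neon.h>

/* Coarse per-lane reciprocal-square-root estimate.  */
uint32x4_t
rsqrt_est (uint32x4_t x)
{
  return vrsqrteq_u32 (x);
}

End of sketch.)
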
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vmvn_u16 (uint16x4_t a)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vmvn_s32 (int32x2_t a)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrteq_u32 (uint32x4_t a)
{
-- uint16x4_t result;
+- int32x2_t result;
- __asm__ ("mvn %0.8b,%1.8b"
+ uint32x4_t result;
+ __asm__ ("ursqrte %0.4s,%1.4s"
@@ -17297,13 +18735,9 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return result;
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vmvn_u32 (uint32x2_t a)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vmvn_u8 (uint8x8_t a)
-{
-- uint32x2_t result;
-- __asm__ ("mvn %0.8b,%1.8b"
-- : "=w"(result)
-- : "w"(a)
+#define vshrn_high_n_s16(a, b, c) \
+ __extension__ \
+ ({ \
@@ -17544,6 +18978,19 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ result; \
+ })
+
++#define vsri_n_p64(a, b, c) \
++ __extension__ \
++ ({ \
++ poly64x1_t b_ = (b); \
++ poly64x1_t a_ = (a); \
++ poly64x1_t result; \
++ __asm__ ("sri %d0,%d2,%3" \
++ : "=w"(result) \
++ : "0"(a_), "w"(b_), "i"(c) \
++ : /* No clobbers. */); \
++ result; \
++ })
++
+#define vsriq_n_p8(a, b, c) \
+ __extension__ \
+ ({ \
@@ -17570,6 +19017,19 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ result; \
+ })
+
++#define vsriq_n_p64(a, b, c) \
++ __extension__ \
++ ({ \
++ poly64x2_t b_ = (b); \
++ poly64x2_t a_ = (a); \
++ poly64x2_t result; \
++ __asm__ ("sri %0.2d,%2.2d,%3" \
++ : "=w"(result) \
++ : "0"(a_), "w"(b_), "i"(c) \
++ : /* No clobbers. */); \
++ result; \
++ })
++
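
(For context: vsri_n_p64/vsriq_n_p64 are new in this snapshot. SRI shifts each lane of the second operand right by the immediate and inserts the result into the destination, preserving the destination's top bits — hence the "0"(a_) input tie. A usage sketch, illustrative only and not part of the patch:

#include <arm_neon.h>

/* Shift-right-insert on poly64 lanes: keep the top 8 bits of a,
   take the remaining bits from b >> 8.  */
poly64x2_t
sri_demo (poly64x2_t a, poly64x2_t b)
{
  return vsriq_n_p64 (a, b, 8);
}

End of sketch.)
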
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vtst_p8 (poly8x8_t a, poly8x8_t b)
@@ -17737,6 +19197,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ int8x16_t)
+__ST2_LANE_FUNC (poly16x4x2_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi, p16,
+ int16x8_t)
++__ST2_LANE_FUNC (poly64x1x2_t, poly64x2x2_t, poly64_t, di, v2di_ssps, di, p64,
++ poly64x2_t)
+__ST2_LANE_FUNC (int8x8x2_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8,
+ int8x16_t)
+__ST2_LANE_FUNC (int16x4x2_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16,
@@ -17772,6 +19234,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+__ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
+__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
+__ST2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
++__ST2_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64)
+__ST2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
+__ST2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
+__ST2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
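
(For context: the p64 lines added above wire poly64 into the st2 lane-store generator, with st3/st4 counterparts below; by the existing naming pattern the macros should expand to vst2_lane_p64 and vst2q_lane_p64. A usage sketch under that assumption, illustrative only, on a target where the poly64 types are available:

#include <arm_neon.h>

/* Store lane 1 of a pair of poly64x2_t registers as two
   consecutive poly64_t values.  */
void
store_pair_lane (poly64_t *p, poly64x2x2_t val)
{
  vst2q_lane_p64 (p, val, 1);
}

End of sketch.)
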
@@ -17819,6 +19282,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ int8x16_t)
+__ST3_LANE_FUNC (poly16x4x3_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi, p16,
+ int16x8_t)
++__ST3_LANE_FUNC (poly64x1x3_t, poly64x2x3_t, poly64_t, di, v2di_ssps, di, p64,
++ poly64x2_t)
+__ST3_LANE_FUNC (int8x8x3_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8,
+ int8x16_t)
+__ST3_LANE_FUNC (int16x4x3_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16,
@@ -17854,6 +19319,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+__ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
+__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
+__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16)
++__ST3_LANE_FUNC (poly64x2x3_t, poly64_t, v2di, di, p64)
+__ST3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8)
+__ST3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16)
+__ST3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32)
@@ -17906,6 +19372,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ int8x16_t)
+__ST4_LANE_FUNC (poly16x4x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi, p16,
+ int16x8_t)
++__ST4_LANE_FUNC (poly64x1x4_t, poly64x2x4_t, poly64_t, di, v2di_ssps, di, p64,
++ poly64x2_t)
+__ST4_LANE_FUNC (int8x8x4_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8,
+ int8x16_t)
+__ST4_LANE_FUNC (int16x4x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16,
@@ -17941,6 +19409,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+__ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
+__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
+__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16)
++__ST4_LANE_FUNC (poly64x2x4_t, poly64_t, v2di, di, p64)
+__ST4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8)
+__ST4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16)
+__ST4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32)
@@ -18189,10 +19658,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vtbl1_u8 (uint8x8_t tab, uint8x8_t idx)
+{
-+ uint8x8_t result;
+ uint8x8_t result;
+- __asm__ ("mvn %0.8b,%1.8b"
+ uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
+ __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-+ : "=w"(result)
+ : "=w"(result)
+- : "w"(a)
+ : "w"(temp), "w"(idx)
+ : /* No clobbers */);
+ return result;
@@ -18391,8 +19862,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return result;
}
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vmvnq_p8 (poly8x16_t a)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vmvn_u16 (uint16x4_t a)
+/* End of temporary inline asm. */
+
+/* Start of optimal implementations in approved order. */
@@ -18702,6 +20173,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+{
+ return __builtin_aarch64_simd_bslv4hi_pupp (__a, __b, __c);
+}
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vbsl_p64 (uint64x1_t __a, poly64x1_t __b, poly64x1_t __c)
++{
++ return (poly64x1_t)
++ {__builtin_aarch64_simd_bsldi_pupp (__a[0], __b[0], __c[0])};
++}
+
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
@@ -18810,6 +20288,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return __builtin_aarch64_simd_bslv8hi_suss (__a, __b, __c);
+}
+
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vbslq_p64 (uint64x2_t __a, poly64x2_t __b, poly64x2_t __c)
++{
++ return __builtin_aarch64_simd_bslv2di_pupp (__a, __b, __c);
++}
++
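
(For context: vbsl_p64/vbslq_p64 are the poly64 bitwise selects — every result bit comes from the second operand where the corresponding mask bit is set, and from the third operand otherwise. A usage sketch, illustrative only and not part of the patch:

#include <arm_neon.h>

/* Per-bit select between two poly64 vectors.  */
poly64x2_t
select_demo (uint64x2_t mask, poly64x2_t b, poly64x2_t c)
{
  return vbslq_p64 (mask, b, c);
}

End of sketch.)
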
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbslq_s32 (uint32x4_t __a, int32x4_t __b, int32x4_t __c)
@@ -18943,140 +20428,70 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrdmlsh_laneq_s16 (int16x4_t __a, int16x4_t __b, int16x8_t __c, const int __d)
- {
-- poly8x16_t result;
-- __asm__ ("mvn %0.16b,%1.16b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
++{
+ return __builtin_aarch64_sqrdmlsh_laneqv4hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vmvnq_s8 (int8x16_t a)
++}
++
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrdmlsh_laneq_s32 (int32x2_t __a, int32x2_t __b, int32x4_t __c, const int __d)
- {
-- int8x16_t result;
-- __asm__ ("mvn %0.16b,%1.16b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
++{
+ return __builtin_aarch64_sqrdmlsh_laneqv2si (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmvnq_s16 (int16x8_t a)
++}
++
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrdmlshq_laneq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c, const int __d)
- {
-- int16x8_t result;
-- __asm__ ("mvn %0.16b,%1.16b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
++{
+ return __builtin_aarch64_sqrdmlsh_laneqv8hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmvnq_s32 (int32x4_t a)
++}
++
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrdmlshq_laneq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c, const int __d)
- {
-- int32x4_t result;
-- __asm__ ("mvn %0.16b,%1.16b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
++{
+ return __builtin_aarch64_sqrdmlsh_laneqv4si (__a, __b, __c, __d);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vmvnq_u8 (uint8x16_t a)
++}
++
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrdmlah_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
- {
-- uint8x16_t result;
-- __asm__ ("mvn %0.16b,%1.16b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
++{
+ return __builtin_aarch64_sqrdmlah_lanev4hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmvnq_u16 (uint16x8_t a)
++}
++
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrdmlah_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
- {
-- uint16x8_t result;
-- __asm__ ("mvn %0.16b,%1.16b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
++{
+ return __builtin_aarch64_sqrdmlah_lanev2si (__a, __b, __c, __d);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmvnq_u32 (uint32x4_t a)
++}
++
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrdmlahq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
- {
-- uint32x4_t result;
-- __asm__ ("mvn %0.16b,%1.16b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
++{
+ return __builtin_aarch64_sqrdmlah_lanev8hi (__a, __b, __c, __d);
- }
-
++}
++
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrdmlahq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
+{
+ return __builtin_aarch64_sqrdmlah_lanev4si (__a, __b, __c, __d);
+}
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vpadal_s8 (int16x4_t a, int8x8_t b)
++
+__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrdmlahh_s16 (int16_t __a, int16_t __b, int16_t __c)
- {
-- int16x4_t result;
-- __asm__ ("sadalp %0.4h,%2.8b"
-- : "=w"(result)
-- : "0"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
++{
+ return (int16_t) __builtin_aarch64_sqrdmlahhi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vpadal_s16 (int32x2_t a, int16x4_t b)
++}
++
+__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrdmlahh_lane_s16 (int16_t __a, int16_t __b, int16x4_t __c, const int __d)
- {
-- int32x2_t result;
-- __asm__ ("sadalp %0.2s,%2.4h"
-- : "=w"(result)
-- : "0"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
++{
+ return __builtin_aarch64_sqrdmlah_lanehi (__a, __b, __c, __d);
+}
+
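
(For context: the vqrdmlah/vqrdmlsh builtins above map to the ARMv8.1-A SQRDMLAH/SQRDMLSH instructions — saturating rounding doubling multiply-accumulate — so uses need an ARMv8.1-A target. A usage sketch, illustrative only and not part of the patch, assuming e.g. -march=armv8.1-a:

#include <arm_neon.h>

/* acc[i] = sat (acc[i] + rounding-doubling-mulh (v[i], coeffs[0])).  */
int16x4_t
rdmlah_demo (int16x4_t acc, int16x4_t v, int16x4_t coeffs)
{
  return vqrdmlah_lane_s16 (acc, v, coeffs, 0);
}

End of sketch.)
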
@@ -19186,66 +20601,136 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vaeseq_u8 (uint8x16_t data, uint8x16_t key)
-+{
+ {
+- uint16x4_t result;
+- __asm__ ("mvn %0.8b,%1.8b"
+- : "=w"(result)
+- : "w"(a)
+- : /* No clobbers */);
+- return result;
+ return __builtin_aarch64_crypto_aesev16qi_uuu (data, key);
-+}
-+
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vmvn_u32 (uint32x2_t a)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vaesdq_u8 (uint8x16_t data, uint8x16_t key)
-+{
+ {
+- uint32x2_t result;
+- __asm__ ("mvn %0.8b,%1.8b"
+- : "=w"(result)
+- : "w"(a)
+- : /* No clobbers */);
+- return result;
+ return __builtin_aarch64_crypto_aesdv16qi_uuu (data, key);
-+}
-+
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+-vmvnq_p8 (poly8x16_t a)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vaesmcq_u8 (uint8x16_t data)
-+{
+ {
+- poly8x16_t result;
+- __asm__ ("mvn %0.16b,%1.16b"
+- : "=w"(result)
+- : "w"(a)
+- : /* No clobbers */);
+- return result;
+ return __builtin_aarch64_crypto_aesmcv16qi_uu (data);
-+}
-+
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+-vmvnq_s8 (int8x16_t a)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vaesimcq_u8 (uint8x16_t data)
-+{
+ {
+- int8x16_t result;
+- __asm__ ("mvn %0.16b,%1.16b"
+- : "=w"(result)
+- : "w"(a)
+- : /* No clobbers */);
+- return result;
+ return __builtin_aarch64_crypto_aesimcv16qi_uu (data);
-+}
+ }
+#pragma GCC pop_options
+
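
(For context: the pop_options above closes the crypto-target region these AES helpers live in, so callers need a crypto-capable target, e.g. -march=armv8-a+crypto. One full AES encryption round is MixColumns applied on top of AESE's AddRoundKey+SubBytes+ShiftRows. A usage sketch, illustrative only and not part of the patch:

#include <arm_neon.h>

/* One AES encryption round (the final round omits vaesmcq_u8).  */
uint8x16_t
aes_round (uint8x16_t state, uint8x16_t round_key)
{
  return vaesmcq_u8 (vaeseq_u8 (state, round_key));
}

End of sketch.)
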
+/* vcage */
-+
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vmvnq_s16 (int16x8_t a)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcage_f64 (float64x1_t __a, float64x1_t __b)
-+{
+ {
+- int16x8_t result;
+- __asm__ ("mvn %0.16b,%1.16b"
+- : "=w"(result)
+- : "w"(a)
+- : /* No clobbers */);
+- return result;
+ return vabs_f64 (__a) >= vabs_f64 (__b);
-+}
-+
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vmvnq_s32 (int32x4_t a)
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcages_f32 (float32_t __a, float32_t __b)
-+{
+ {
+- int32x4_t result;
+- __asm__ ("mvn %0.16b,%1.16b"
+- : "=w"(result)
+- : "w"(a)
+- : /* No clobbers */);
+- return result;
+ return __builtin_fabsf (__a) >= __builtin_fabsf (__b) ? -1 : 0;
-+}
-+
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vmvnq_u8 (uint8x16_t a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcage_f32 (float32x2_t __a, float32x2_t __b)
-+{
+ {
+- uint8x16_t result;
+- __asm__ ("mvn %0.16b,%1.16b"
+- : "=w"(result)
+- : "w"(a)
+- : /* No clobbers */);
+- return result;
+ return vabs_f32 (__a) >= vabs_f32 (__b);
-+}
-+
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vmvnq_u16 (uint16x8_t a)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcageq_f32 (float32x4_t __a, float32x4_t __b)
-+{
+ {
+- uint16x8_t result;
+- __asm__ ("mvn %0.16b,%1.16b"
+- : "=w"(result)
+- : "w"(a)
+- : /* No clobbers */);
+- return result;
+ return vabsq_f32 (__a) >= vabsq_f32 (__b);
-+}
-+
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vmvnq_u32 (uint32x4_t a)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcaged_f64 (float64_t __a, float64_t __b)
-+{
+ {
+- uint32x4_t result;
+- __asm__ ("mvn %0.16b,%1.16b"
+- : "=w"(result)
+- : "w"(a)
+- : /* No clobbers */);
+- return result;
+ return __builtin_fabs (__a) >= __builtin_fabs (__b) ? -1 : 0;
+}
+
@@ -19254,14 +20739,22 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+vcageq_f64 (float64x2_t __a, float64x2_t __b)
+{
+ return vabsq_f64 (__a) >= vabsq_f64 (__b);
-+}
-+
+ }
+
+/* vcagt */
-+
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vpadal_s8 (int16x4_t a, int8x8_t b)
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcagts_f32 (float32_t __a, float32_t __b)
-+{
+ {
+- int16x4_t result;
+- __asm__ ("sadalp %0.4h,%2.8b"
+- : "=w"(result)
+- : "0"(a), "w"(b)
+- : /* No clobbers */);
+- return result;
+ return __builtin_fabsf (__a) > __builtin_fabsf (__b) ? -1 : 0;
+}
+
@@ -19335,13 +20828,193 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+vcaleq_f32 (float32x4_t __a, float32x4_t __b)
+{
+ return vabsq_f32 (__a) <= vabsq_f32 (__b);
++}
++
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcaleq_f64 (float64x2_t __a, float64x2_t __b)
++{
++ return vabsq_f64 (__a) <= vabsq_f64 (__b);
++}
++
++/* vcalt */
++
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcalt_f32 (float32x2_t __a, float32x2_t __b)
++{
++ return vabs_f32 (__a) < vabs_f32 (__b);
++}
++
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcalt_f64 (float64x1_t __a, float64x1_t __b)
++{
++ return vabs_f64 (__a) < vabs_f64 (__b);
++}
++
++__extension__ extern __inline uint64_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcaltd_f64 (float64_t __a, float64_t __b)
++{
++ return __builtin_fabs (__a) < __builtin_fabs (__b) ? -1 : 0;
++}
++
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcaltq_f32 (float32x4_t __a, float32x4_t __b)
++{
++ return vabsq_f32 (__a) < vabsq_f32 (__b);
++}
++
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcaltq_f64 (float64x2_t __a, float64x2_t __b)
++{
++ return vabsq_f64 (__a) < vabsq_f64 (__b);
++}
++
++__extension__ extern __inline uint32_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcalts_f32 (float32_t __a, float32_t __b)
++{
++ return __builtin_fabsf (__a) < __builtin_fabsf (__b) ? -1 : 0;
++}
++
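
(For context: the absolute compares above — vcage/vcagt/vcale/vcalt and their scalar forms — are now written as vabs plus an ordinary comparison, which the backend can match back to instructions such as FACGE/FACGT; result lanes are all-ones where the predicate holds. A usage sketch, illustrative only and not part of the patch:

#include <arm_neon.h>

/* Lane mask: all-ones where |a| >= |b|, zero otherwise.  */
uint32x4_t
abs_ge (float32x4_t a, float32x4_t b)
{
  return vcageq_f32 (a, b);
}

End of sketch.)
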
++/* vceq - vector. */
++
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vceq_f32 (float32x2_t __a, float32x2_t __b)
++{
++ return (uint32x2_t) (__a == __b);
++}
++
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vceq_f64 (float64x1_t __a, float64x1_t __b)
++{
++ return (uint64x1_t) (__a == __b);
++}
++
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vceq_p8 (poly8x8_t __a, poly8x8_t __b)
++{
++ return (uint8x8_t) (__a == __b);
++}
++
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vceq_p64 (poly64x1_t __a, poly64x1_t __b)
++{
++ return (uint64x1_t) (__a == __b);
++}
++
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vceq_s8 (int8x8_t __a, int8x8_t __b)
++{
++ return (uint8x8_t) (__a == __b);
++}
++
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vceq_s16 (int16x4_t __a, int16x4_t __b)
++{
++ return (uint16x4_t) (__a == __b);
++}
++
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vceq_s32 (int32x2_t __a, int32x2_t __b)
++{
++ return (uint32x2_t) (__a == __b);
++}
++
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vceq_s64 (int64x1_t __a, int64x1_t __b)
++{
++ return (uint64x1_t) (__a == __b);
++}
++
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vceq_u8 (uint8x8_t __a, uint8x8_t __b)
++{
++ return (__a == __b);
++}
++
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vceq_u16 (uint16x4_t __a, uint16x4_t __b)
++{
++ return (__a == __b);
++}
++
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vceq_u32 (uint32x2_t __a, uint32x2_t __b)
++{
++ return (__a == __b);
++}
++
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vceq_u64 (uint64x1_t __a, uint64x1_t __b)
++{
++ return (__a == __b);
++}
++
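
(For context: the vceq family returns all-ones/all-zero lane masks, which composes directly with the vbsl selects defined earlier. A usage sketch, illustrative only and not part of the patch:

#include <arm_neon.h>

/* Replace every lane of v that equals the matching lane of key.  */
uint32x4_t
replace_equal (uint32x4_t v, uint32x4_t key, uint32x4_t repl)
{
  uint32x4_t mask = vceqq_u32 (v, key);
  return vbslq_u32 (mask, repl, v);
}

End of sketch.)
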
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vceqq_f32 (float32x4_t __a, float32x4_t __b)
++{
++ return (uint32x4_t) (__a == __b);
++}
++
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vceqq_f64 (float64x2_t __a, float64x2_t __b)
++{
++ return (uint64x2_t) (__a == __b);
++}
++
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vceqq_p8 (poly8x16_t __a, poly8x16_t __b)
++{
++ return (uint8x16_t) (__a == __b);
++}
++
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vceqq_s8 (int8x16_t __a, int8x16_t __b)
++{
++ return (uint8x16_t) (__a == __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vpadal_s16 (int32x2_t a, int16x4_t b)
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vceqq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+- int32x2_t result;
+- __asm__ ("sadalp %0.2s,%2.4h"
+- : "=w"(result)
+- : "0"(a), "w"(b)
+- : /* No clobbers */);
+- return result;
++ return (uint16x8_t) (__a == __b);
}
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vpadal_s32 (int64x1_t a, int32x2_t b)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcaleq_f64 (float64x2_t __a, float64x2_t __b)
++vceqq_s32 (int32x4_t __a, int32x4_t __b)
{
- int64x1_t result;
- __asm__ ("sadalp %0.1d,%2.2s"
@@ -19349,16 +21022,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return vabsq_f64 (__a) <= vabsq_f64 (__b);
++ return (uint32x4_t) (__a == __b);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vpadal_u8 (uint16x4_t a, uint8x8_t b)
-+/* vcalt */
-+
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcalt_f32 (float32x2_t __a, float32x2_t __b)
++vceqq_s64 (int64x2_t __a, int64x2_t __b)
{
- uint16x4_t result;
- __asm__ ("uadalp %0.4h,%2.8b"
@@ -19366,14 +21037,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return vabs_f32 (__a) < vabs_f32 (__b);
++ return (uint64x2_t) (__a == __b);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vpadal_u16 (uint32x2_t a, uint16x4_t b)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcalt_f64 (float64x1_t __a, float64x1_t __b)
++vceqq_u8 (uint8x16_t __a, uint8x16_t __b)
{
- uint32x2_t result;
- __asm__ ("uadalp %0.2s,%2.4h"
@@ -19381,14 +21052,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return vabs_f64 (__a) < vabs_f64 (__b);
++ return (__a == __b);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vpadal_u32 (uint64x1_t a, uint32x2_t b)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcaltd_f64 (float64_t __a, float64_t __b)
++vceqq_u16 (uint16x8_t __a, uint16x8_t __b)
{
- uint64x1_t result;
- __asm__ ("uadalp %0.1d,%2.2s"
@@ -19396,14 +21067,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return __builtin_fabs (__a) < __builtin_fabs (__b) ? -1 : 0;
++ return (__a == __b);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vpadalq_s8 (int16x8_t a, int8x16_t b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcaltq_f32 (float32x4_t __a, float32x4_t __b)
++vceqq_u32 (uint32x4_t __a, uint32x4_t __b)
{
- int16x8_t result;
- __asm__ ("sadalp %0.8h,%2.16b"
@@ -19411,14 +21082,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return vabsq_f32 (__a) < vabsq_f32 (__b);
++ return (__a == __b);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vpadalq_s16 (int32x4_t a, int16x8_t b)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcaltq_f64 (float64x2_t __a, float64x2_t __b)
++vceqq_u64 (uint64x2_t __a, uint64x2_t __b)
{
- int32x4_t result;
- __asm__ ("sadalp %0.4s,%2.8h"
@@ -19426,14 +21097,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return vabsq_f64 (__a) < vabsq_f64 (__b);
++ return (__a == __b);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vpadalq_s32 (int64x2_t a, int32x4_t b)
++/* vceq - scalar. */
++
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcalts_f32 (float32_t __a, float32_t __b)
++vceqs_f32 (float32_t __a, float32_t __b)
{
- int64x2_t result;
- __asm__ ("sadalp %0.2d,%2.4s"
@@ -19441,16 +21114,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return __builtin_fabsf (__a) < __builtin_fabsf (__b) ? -1 : 0;
++ return __a == __b ? -1 : 0;
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vpadalq_u8 (uint16x8_t a, uint8x16_t b)
-+/* vceq - vector. */
-+
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_f32 (float32x2_t __a, float32x2_t __b)
++vceqd_s64 (int64_t __a, int64_t __b)
{
- uint16x8_t result;
- __asm__ ("uadalp %0.8h,%2.16b"
@@ -19458,14 +21129,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint32x2_t) (__a == __b);
++ return __a == __b ? -1ll : 0ll;
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vpadalq_u16 (uint32x4_t a, uint16x8_t b)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_f64 (float64x1_t __a, float64x1_t __b)
++vceqd_u64 (uint64_t __a, uint64_t __b)
{
- uint32x4_t result;
- __asm__ ("uadalp %0.4s,%2.8h"
@@ -19473,14 +21144,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint64x1_t) (__a == __b);
++ return __a == __b ? -1ll : 0ll;
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vpadalq_u32 (uint64x2_t a, uint32x4_t b)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_p8 (poly8x8_t __a, poly8x8_t __b)
++vceqd_f64 (float64_t __a, float64_t __b)
{
- uint64x2_t result;
- __asm__ ("uadalp %0.2d,%2.4s"
@@ -19488,14 +21159,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint8x8_t) (__a == __b);
++ return __a == __b ? -1ll : 0ll;
}
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vpadd_f32 (float32x2_t a, float32x2_t b)
-+__extension__ extern __inline uint8x8_t
++/* vceqz - vector. */
++
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_s8 (int8x8_t __a, int8x8_t __b)
++vceqz_f32 (float32x2_t __a)
{
- float32x2_t result;
- __asm__ ("faddp %0.2s,%1.2s,%2.2s"
@@ -19503,14 +21176,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint8x8_t) (__a == __b);
++ return (uint32x2_t) (__a == 0.0f);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vpaddl_s8 (int8x8_t a)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_s16 (int16x4_t __a, int16x4_t __b)
++vceqz_f64 (float64x1_t __a)
{
- int16x4_t result;
- __asm__ ("saddlp %0.4h,%1.8b"
@@ -19518,14 +21191,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a)
- : /* No clobbers */);
- return result;
-+ return (uint16x4_t) (__a == __b);
++ return (uint64x1_t) (__a == (float64x1_t) {0.0});
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vpaddl_s16 (int16x4_t a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_s32 (int32x2_t __a, int32x2_t __b)
++vceqz_p8 (poly8x8_t __a)
{
- int32x2_t result;
- __asm__ ("saddlp %0.2s,%1.4h"
@@ -19533,14 +21206,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a)
- : /* No clobbers */);
- return result;
-+ return (uint32x2_t) (__a == __b);
++ return (uint8x8_t) (__a == 0);
}
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vpaddl_s32 (int32x2_t a)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_s64 (int64x1_t __a, int64x1_t __b)
++vceqz_s8 (int8x8_t __a)
{
- int64x1_t result;
- __asm__ ("saddlp %0.1d,%1.2s"
@@ -19548,14 +21221,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a)
- : /* No clobbers */);
- return result;
-+ return (uint64x1_t) (__a == __b);
++ return (uint8x8_t) (__a == 0);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vpaddl_u8 (uint8x8_t a)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_u8 (uint8x8_t __a, uint8x8_t __b)
++vceqz_s16 (int16x4_t __a)
{
- uint16x4_t result;
- __asm__ ("uaddlp %0.4h,%1.8b"
@@ -19563,14 +21236,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a)
- : /* No clobbers */);
- return result;
-+ return (__a == __b);
++ return (uint16x4_t) (__a == 0);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vpaddl_u16 (uint16x4_t a)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_u16 (uint16x4_t __a, uint16x4_t __b)
++vceqz_s32 (int32x2_t __a)
{
- uint32x2_t result;
- __asm__ ("uaddlp %0.2s,%1.4h"
@@ -19578,14 +21251,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a)
- : /* No clobbers */);
- return result;
-+ return (__a == __b);
++ return (uint32x2_t) (__a == 0);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vpaddl_u32 (uint32x2_t a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_u32 (uint32x2_t __a, uint32x2_t __b)
++vceqz_s64 (int64x1_t __a)
{
- uint64x1_t result;
- __asm__ ("uaddlp %0.1d,%1.2s"
@@ -19593,14 +21266,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a)
- : /* No clobbers */);
- return result;
-+ return (__a == __b);
++ return (uint64x1_t) (__a == __AARCH64_INT64_C (0));
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vpaddlq_s8 (int8x16_t a)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_u64 (uint64x1_t __a, uint64x1_t __b)
++vceqz_u8 (uint8x8_t __a)
{
- int16x8_t result;
- __asm__ ("saddlp %0.8h,%1.16b"
@@ -19608,14 +21281,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a)
- : /* No clobbers */);
- return result;
-+ return (__a == __b);
++ return (__a == 0);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vpaddlq_s16 (int16x8_t a)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_f32 (float32x4_t __a, float32x4_t __b)
++vceqz_u16 (uint16x4_t __a)
{
- int32x4_t result;
- __asm__ ("saddlp %0.4s,%1.8h"
@@ -19623,14 +21296,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a)
- : /* No clobbers */);
- return result;
-+ return (uint32x4_t) (__a == __b);
++ return (__a == 0);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vpaddlq_s32 (int32x4_t a)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_f64 (float64x2_t __a, float64x2_t __b)
++vceqz_u32 (uint32x2_t __a)
{
- int64x2_t result;
- __asm__ ("saddlp %0.2d,%1.4s"
@@ -19638,14 +21311,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a)
- : /* No clobbers */);
- return result;
-+ return (uint64x2_t) (__a == __b);
++ return (__a == 0);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vpaddlq_u8 (uint8x16_t a)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_p8 (poly8x16_t __a, poly8x16_t __b)
++vceqz_u64 (uint64x1_t __a)
{
- uint16x8_t result;
- __asm__ ("uaddlp %0.8h,%1.16b"
@@ -19653,14 +21326,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a)
- : /* No clobbers */);
- return result;
-+ return (uint8x16_t) (__a == __b);
++ return (__a == __AARCH64_UINT64_C (0));
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vpaddlq_u16 (uint16x8_t a)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_s8 (int8x16_t __a, int8x16_t __b)
++vceqzq_f32 (float32x4_t __a)
{
- uint32x4_t result;
- __asm__ ("uaddlp %0.4s,%1.8h"
@@ -19668,14 +21341,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a)
- : /* No clobbers */);
- return result;
-+ return (uint8x16_t) (__a == __b);
++ return (uint32x4_t) (__a == 0.0f);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vpaddlq_u32 (uint32x4_t a)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_s16 (int16x8_t __a, int16x8_t __b)
++vceqzq_f64 (float64x2_t __a)
{
- uint64x2_t result;
- __asm__ ("uaddlp %0.2d,%1.4s"
@@ -19683,14 +21356,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a)
- : /* No clobbers */);
- return result;
-+ return (uint16x8_t) (__a == __b);
++ return (uint64x2_t) (__a == 0.0f);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vpaddq_f32 (float32x4_t a, float32x4_t b)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_s32 (int32x4_t __a, int32x4_t __b)
++vceqzq_p8 (poly8x16_t __a)
{
- float32x4_t result;
- __asm__ ("faddp %0.4s,%1.4s,%2.4s"
@@ -19698,14 +21371,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint32x4_t) (__a == __b);
++ return (uint8x16_t) (__a == 0);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vpaddq_f64 (float64x2_t a, float64x2_t b)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_s64 (int64x2_t __a, int64x2_t __b)
++vceqzq_s8 (int8x16_t __a)
{
- float64x2_t result;
- __asm__ ("faddp %0.2d,%1.2d,%2.2d"
@@ -19713,14 +21386,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint64x2_t) (__a == __b);
++ return (uint8x16_t) (__a == 0);
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vpaddq_s8 (int8x16_t a, int8x16_t b)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_u8 (uint8x16_t __a, uint8x16_t __b)
++vceqzq_s16 (int16x8_t __a)
{
- int8x16_t result;
- __asm__ ("addp %0.16b,%1.16b,%2.16b"
@@ -19728,14 +21401,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (__a == __b);
++ return (uint16x8_t) (__a == 0);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vpaddq_s16 (int16x8_t a, int16x8_t b)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_u16 (uint16x8_t __a, uint16x8_t __b)
++vceqzq_s32 (int32x4_t __a)
{
- int16x8_t result;
- __asm__ ("addp %0.8h,%1.8h,%2.8h"
@@ -19743,14 +21416,28 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (__a == __b);
++ return (uint32x4_t) (__a == 0);
++}
++
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vceqzq_s64 (int64x2_t __a)
++{
++ return (uint64x2_t) (__a == __AARCH64_INT64_C (0));
++}
++
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vceqzq_u8 (uint8x16_t __a)
++{
++ return (__a == 0);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vpaddq_s32 (int32x4_t a, int32x4_t b)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_u32 (uint32x4_t __a, uint32x4_t __b)
++vceqzq_u16 (uint16x8_t __a)
{
- int32x4_t result;
- __asm__ ("addp %0.4s,%1.4s,%2.4s"
@@ -19758,23 +21445,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (__a == __b);
-+}
-+
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_u64 (uint64x2_t __a, uint64x2_t __b)
-+{
-+ return (__a == __b);
++ return (__a == 0);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vpaddq_s64 (int64x2_t a, int64x2_t b)
-+/* vceq - scalar. */
-+
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqs_f32 (float32_t __a, float32_t __b)
++vceqzq_u32 (uint32x4_t __a)
{
- int64x2_t result;
- __asm__ ("addp %0.2d,%1.2d,%2.2d"
@@ -19782,14 +21460,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return __a == __b ? -1 : 0;
++ return (__a == 0);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vpaddq_u8 (uint8x16_t a, uint8x16_t b)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqd_s64 (int64_t __a, int64_t __b)
++vceqzq_u64 (uint64x2_t __a)
{
- uint8x16_t result;
- __asm__ ("addp %0.16b,%1.16b,%2.16b"
@@ -19797,14 +21475,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return __a == __b ? -1ll : 0ll;
++ return (__a == __AARCH64_UINT64_C (0));
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vpaddq_u16 (uint16x8_t a, uint16x8_t b)
-+__extension__ extern __inline uint64_t
++/* vceqz - scalar. */
++
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqd_u64 (uint64_t __a, uint64_t __b)
++vceqzs_f32 (float32_t __a)
{
- uint16x8_t result;
- __asm__ ("addp %0.8h,%1.8h,%2.8h"
@@ -19812,14 +21492,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return __a == __b ? -1ll : 0ll;
++ return __a == 0.0f ? -1 : 0;
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vpaddq_u32 (uint32x4_t a, uint32x4_t b)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqd_f64 (float64_t __a, float64_t __b)
++vceqzd_s64 (int64_t __a)
{
- uint32x4_t result;
- __asm__ ("addp %0.4s,%1.4s,%2.4s"
@@ -19827,16 +21507,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return __a == __b ? -1ll : 0ll;
++ return __a == 0 ? -1ll : 0ll;
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vpaddq_u64 (uint64x2_t a, uint64x2_t b)
-+/* vceqz - vector. */
-+
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_f32 (float32x2_t __a)
++vceqzd_u64 (uint64_t __a)
{
- uint64x2_t result;
- __asm__ ("addp %0.2d,%1.2d,%2.2d"
@@ -19844,14 +21522,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint32x2_t) (__a == 0.0f);
++ return __a == 0 ? -1ll : 0ll;
}
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vpadds_f32 (float32x2_t a)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_f64 (float64x1_t __a)
++vceqzd_f64 (float64_t __a)
{
- float32_t result;
- __asm__ ("faddp %s0,%1.2s"
@@ -19859,14 +21537,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a)
- : /* No clobbers */);
- return result;
-+ return (uint64x1_t) (__a == (float64x1_t) {0.0});
++ return __a == 0.0 ? -1ll : 0ll;
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqdmulh_n_s16 (int16x4_t a, int16_t b)
-+__extension__ extern __inline uint8x8_t
++/* vcge - vector. */
++
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_p8 (poly8x8_t __a)
++vcge_f32 (float32x2_t __a, float32x2_t __b)
{
- int16x4_t result;
- __asm__ ("sqdmulh %0.4h,%1.4h,%2.h[0]"
@@ -19874,14 +21554,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "x"(b)
- : /* No clobbers */);
- return result;
-+ return (uint8x8_t) (__a == 0);
++ return (uint32x2_t) (__a >= __b);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqdmulh_n_s32 (int32x2_t a, int32_t b)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_s8 (int8x8_t __a)
++vcge_f64 (float64x1_t __a, float64x1_t __b)
{
- int32x2_t result;
- __asm__ ("sqdmulh %0.2s,%1.2s,%2.s[0]"
@@ -19889,14 +21569,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint8x8_t) (__a == 0);
++ return (uint64x1_t) (__a >= __b);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqdmulhq_n_s16 (int16x8_t a, int16_t b)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_s16 (int16x4_t __a)
++vcge_s8 (int8x8_t __a, int8x8_t __b)
{
- int16x8_t result;
- __asm__ ("sqdmulh %0.8h,%1.8h,%2.h[0]"
@@ -19904,14 +21584,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "x"(b)
- : /* No clobbers */);
- return result;
-+ return (uint16x4_t) (__a == 0);
++ return (uint8x8_t) (__a >= __b);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmulhq_n_s32 (int32x4_t a, int32_t b)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_s32 (int32x2_t __a)
++vcge_s16 (int16x4_t __a, int16x4_t __b)
{
- int32x4_t result;
- __asm__ ("sqdmulh %0.4s,%1.4s,%2.s[0]"
@@ -19919,14 +21599,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint32x2_t) (__a == 0);
++ return (uint16x4_t) (__a >= __b);
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqmovn_high_s16 (int8x8_t a, int16x8_t b)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_s64 (int64x1_t __a)
++vcge_s32 (int32x2_t __a, int32x2_t __b)
{
- int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("sqxtn2 %0.16b, %1.8h"
@@ -19934,14 +21614,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint64x1_t) (__a == __AARCH64_INT64_C (0));
++ return (uint32x2_t) (__a >= __b);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqmovn_high_s32 (int16x4_t a, int32x4_t b)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_u8 (uint8x8_t __a)
++vcge_s64 (int64x1_t __a, int64x1_t __b)
{
- int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("sqxtn2 %0.8h, %1.4s"
@@ -19949,14 +21629,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(b)
- : /* No clobbers */);
- return result;
-+ return (__a == 0);
++ return (uint64x1_t) (__a >= __b);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqmovn_high_s64 (int32x2_t a, int64x2_t b)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_u16 (uint16x4_t __a)
++vcge_u8 (uint8x8_t __a, uint8x8_t __b)
{
- int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("sqxtn2 %0.4s, %1.2d"
@@ -19964,14 +21644,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(b)
- : /* No clobbers */);
- return result;
-+ return (__a == 0);
++ return (__a >= __b);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqmovn_high_u16 (uint8x8_t a, uint16x8_t b)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_u32 (uint32x2_t __a)
++vcge_u16 (uint16x4_t __a, uint16x4_t __b)
{
- uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("uqxtn2 %0.16b, %1.8h"
@@ -19979,14 +21659,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(b)
- : /* No clobbers */);
- return result;
-+ return (__a == 0);
++ return (__a >= __b);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vqmovn_high_u32 (uint16x4_t a, uint32x4_t b)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_u64 (uint64x1_t __a)
++vcge_u32 (uint32x2_t __a, uint32x2_t __b)
{
- uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("uqxtn2 %0.8h, %1.4s"
@@ -19994,14 +21674,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(b)
- : /* No clobbers */);
- return result;
-+ return (__a == __AARCH64_UINT64_C (0));
++ return (__a >= __b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vqmovn_high_u64 (uint32x2_t a, uint64x2_t b)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_f32 (float32x4_t __a)
++vcge_u64 (uint64x1_t __a, uint64x1_t __b)
{
- uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("uqxtn2 %0.4s, %1.2d"
@@ -20009,14 +21689,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint32x4_t) (__a == 0.0f);
++ return (__a >= __b);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqmovun_high_s16 (uint8x8_t a, int16x8_t b)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_f64 (float64x2_t __a)
++vcgeq_f32 (float32x4_t __a, float32x4_t __b)
{
- uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("sqxtun2 %0.16b, %1.8h"
@@ -20024,14 +21704,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint64x2_t) (__a == 0.0f);
++ return (uint32x4_t) (__a >= __b);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vqmovun_high_s32 (uint16x4_t a, int32x4_t b)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_p8 (poly8x16_t __a)
++vcgeq_f64 (float64x2_t __a, float64x2_t __b)
{
- uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("sqxtun2 %0.8h, %1.4s"
@@ -20039,14 +21719,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint8x16_t) (__a == 0);
++ return (uint64x2_t) (__a >= __b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vqmovun_high_s64 (uint32x2_t a, int64x2_t b)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_s8 (int8x16_t __a)
++vcgeq_s8 (int8x16_t __a, int8x16_t __b)
{
- uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("sqxtun2 %0.4s, %1.2d"
@@ -20054,14 +21734,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint8x16_t) (__a == 0);
++ return (uint8x16_t) (__a >= __b);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqrdmulh_n_s16 (int16x4_t a, int16_t b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_s16 (int16x8_t __a)
++vcgeq_s16 (int16x8_t __a, int16x8_t __b)
{
- int16x4_t result;
- __asm__ ("sqrdmulh %0.4h,%1.4h,%2.h[0]"
@@ -20069,14 +21749,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "x"(b)
- : /* No clobbers */);
- return result;
-+ return (uint16x8_t) (__a == 0);
++ return (uint16x8_t) (__a >= __b);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqrdmulh_n_s32 (int32x2_t a, int32_t b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_s32 (int32x4_t __a)
++vcgeq_s32 (int32x4_t __a, int32x4_t __b)
{
- int32x2_t result;
- __asm__ ("sqrdmulh %0.2s,%1.2s,%2.s[0]"
@@ -20084,14 +21764,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint32x4_t) (__a == 0);
++ return (uint32x4_t) (__a >= __b);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqrdmulhq_n_s16 (int16x8_t a, int16_t b)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_s64 (int64x2_t __a)
++vcgeq_s64 (int64x2_t __a, int64x2_t __b)
{
- int16x8_t result;
- __asm__ ("sqrdmulh %0.8h,%1.8h,%2.h[0]"
@@ -20099,14 +21779,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "x"(b)
- : /* No clobbers */);
- return result;
-+ return (uint64x2_t) (__a == __AARCH64_INT64_C (0));
++ return (uint64x2_t) (__a >= __b);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqrdmulhq_n_s32 (int32x4_t a, int32_t b)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_u8 (uint8x16_t __a)
++vcgeq_u8 (uint8x16_t __a, uint8x16_t __b)
{
- int32x4_t result;
- __asm__ ("sqrdmulh %0.4s,%1.4s,%2.s[0]"
@@ -20114,7 +21794,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (__a == 0);
++ return (__a >= __b);
}
-#define vqrshrn_high_n_s16(a, b, c) \
@@ -20131,7 +21811,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
--
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcgeq_u16 (uint16x8_t __a, uint16x8_t __b)
++{
++ return (__a >= __b);
++}
+
-#define vqrshrn_high_n_s32(a, b, c) \
- __extension__ \
- ({ \
@@ -20146,7 +21832,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
--
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcgeq_u32 (uint32x4_t __a, uint32x4_t __b)
++{
++ return (__a >= __b);
++}
+
-#define vqrshrn_high_n_s64(a, b, c) \
- __extension__ \
- ({ \
@@ -20161,11 +21853,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_u16 (uint16x8_t __a)
++vcgeq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
-+ return (__a == 0);
++ return (__a >= __b);
+}
-#define vqrshrn_high_n_u16(a, b, c) \
@@ -20182,12 +21874,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_u32 (uint32x4_t __a)
-+{
-+ return (__a == 0);
-+}
++/* vcge - scalar. */
-#define vqrshrn_high_n_u32(a, b, c) \
- __extension__ \
@@ -20203,11 +21890,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_u64 (uint64x2_t __a)
++vcges_f32 (float32_t __a, float32_t __b)
+{
-+ return (__a == __AARCH64_UINT64_C (0));
++ return __a >= __b ? -1 : 0;
+}
-#define vqrshrn_high_n_u64(a, b, c) \
@@ -20224,7 +21911,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+/* vceqz - scalar. */
++__extension__ extern __inline uint64_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcged_s64 (int64_t __a, int64_t __b)
++{
++ return __a >= __b ? -1ll : 0ll;
++}
-#define vqrshrun_high_n_s16(a, b, c) \
- __extension__ \
@@ -20240,11 +21932,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzs_f32 (float32_t __a)
++vcged_u64 (uint64_t __a, uint64_t __b)
+{
-+ return __a == 0.0f ? -1 : 0;
++ return __a >= __b ? -1ll : 0ll;
+}
-#define vqrshrun_high_n_s32(a, b, c) \
@@ -20263,9 +21955,9 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- })
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzd_s64 (int64_t __a)
++vcged_f64 (float64_t __a, float64_t __b)
+{
-+ return __a == 0 ? -1ll : 0ll;
++ return __a >= __b ? -1ll : 0ll;
+}
-#define vqrshrun_high_n_s64(a, b, c) \
@@ -20282,12 +21974,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzd_u64 (uint64_t __a)
-+{
-+ return __a == 0 ? -1ll : 0ll;
-+}
++/* vcgez - vector. */
-#define vqshrn_high_n_s16(a, b, c) \
- __extension__ \
@@ -20303,11 +21990,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzd_f64 (float64_t __a)
++vcgez_f32 (float32x2_t __a)
+{
-+ return __a == 0.0 ? -1ll : 0ll;
++ return (uint32x2_t) (__a >= 0.0f);
+}
-#define vqshrn_high_n_s32(a, b, c) \
@@ -20324,7 +22011,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+/* vcge - vector. */
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcgez_f64 (float64x1_t __a)
++{
++ return (uint64x1_t) (__a[0] >= (float64x1_t) {0.0});
++}
-#define vqshrn_high_n_s64(a, b, c) \
- __extension__ \
@@ -20340,11 +22032,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_f32 (float32x2_t __a, float32x2_t __b)
++vcgez_s8 (int8x8_t __a)
+{
-+ return (uint32x2_t) (__a >= __b);
++ return (uint8x8_t) (__a >= 0);
+}
-#define vqshrn_high_n_u16(a, b, c) \
@@ -20361,11 +22053,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_f64 (float64x1_t __a, float64x1_t __b)
++vcgez_s16 (int16x4_t __a)
+{
-+ return (uint64x1_t) (__a >= __b);
++ return (uint16x4_t) (__a >= 0);
+}
-#define vqshrn_high_n_u32(a, b, c) \
@@ -20382,11 +22074,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_s8 (int8x8_t __a, int8x8_t __b)
++vcgez_s32 (int32x2_t __a)
+{
-+ return (uint8x8_t) (__a >= __b);
++ return (uint32x2_t) (__a >= 0);
+}
-#define vqshrn_high_n_u64(a, b, c) \
@@ -20403,11 +22095,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_s16 (int16x4_t __a, int16x4_t __b)
++vcgez_s64 (int64x1_t __a)
+{
-+ return (uint16x4_t) (__a >= __b);
++ return (uint64x1_t) (__a >= __AARCH64_INT64_C (0));
+}
-#define vqshrun_high_n_s16(a, b, c) \
@@ -20424,11 +22116,25 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_s32 (int32x2_t __a, int32x2_t __b)
++vcgezq_f32 (float32x4_t __a)
+{
-+ return (uint32x2_t) (__a >= __b);
++ return (uint32x4_t) (__a >= 0.0f);
++}
++
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcgezq_f64 (float64x2_t __a)
++{
++ return (uint64x2_t) (__a >= 0.0);
++}
++
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcgezq_s8 (int8x16_t __a)
++{
++ return (uint8x16_t) (__a >= 0);
+}
-#define vqshrun_high_n_s32(a, b, c) \
@@ -20445,25 +22151,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_s64 (int64x1_t __a, int64x1_t __b)
-+{
-+ return (uint64x1_t) (__a >= __b);
-+}
-+
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_u8 (uint8x8_t __a, uint8x8_t __b)
-+{
-+ return (__a >= __b);
-+}
-+
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_u16 (uint16x4_t __a, uint16x4_t __b)
++vcgezq_s16 (int16x8_t __a)
+{
-+ return (__a >= __b);
++ return (uint16x8_t) (__a >= 0);
+}
-#define vqshrun_high_n_s64(a, b, c) \
@@ -20480,11 +22172,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_u32 (uint32x2_t __a, uint32x2_t __b)
++vcgezq_s32 (int32x4_t __a)
+{
-+ return (__a >= __b);
++ return (uint32x4_t) (__a >= 0);
+}
-#define vrshrn_high_n_s16(a, b, c) \
@@ -20501,11 +22193,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_u64 (uint64x1_t __a, uint64x1_t __b)
++vcgezq_s64 (int64x2_t __a)
+{
-+ return (__a >= __b);
++ return (uint64x2_t) (__a >= __AARCH64_INT64_C (0));
+}
-#define vrshrn_high_n_s32(a, b, c) \
@@ -20522,12 +22214,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_f32 (float32x4_t __a, float32x4_t __b)
-+{
-+ return (uint32x4_t) (__a >= __b);
-+}
++/* vcgez - scalar. */
-#define vrshrn_high_n_s64(a, b, c) \
- __extension__ \
@@ -20543,11 +22230,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_f64 (float64x2_t __a, float64x2_t __b)
++vcgezs_f32 (float32_t __a)
+{
-+ return (uint64x2_t) (__a >= __b);
++ return __a >= 0.0f ? -1 : 0;
+}
-#define vrshrn_high_n_u16(a, b, c) \
@@ -20564,11 +22251,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_s8 (int8x16_t __a, int8x16_t __b)
++vcgezd_s64 (int64_t __a)
+{
-+ return (uint8x16_t) (__a >= __b);
++ return __a >= 0 ? -1ll : 0ll;
+}
-#define vrshrn_high_n_u32(a, b, c) \
@@ -20585,11 +22272,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_s16 (int16x8_t __a, int16x8_t __b)
++vcgezd_f64 (float64_t __a)
+{
-+ return (uint16x8_t) (__a >= __b);
++ return __a >= 0.0 ? -1ll : 0ll;
+}
-#define vrshrn_high_n_u64(a, b, c) \
@@ -20606,12 +22293,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_s32 (int32x4_t __a, int32x4_t __b)
-+{
-+ return (uint32x4_t) (__a >= __b);
-+}
++/* vcgt - vector. */
-#define vrshrn_n_s16(a, b) \
- __extension__ \
@@ -20624,11 +22306,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_s64 (int64x2_t __a, int64x2_t __b)
++vcgt_f32 (float32x2_t __a, float32x2_t __b)
+{
-+ return (uint64x2_t) (__a >= __b);
++ return (uint32x2_t) (__a > __b);
+}
-#define vrshrn_n_s32(a, b) \
@@ -20642,11 +22324,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_u8 (uint8x16_t __a, uint8x16_t __b)
++vcgt_f64 (float64x1_t __a, float64x1_t __b)
+{
-+ return (__a >= __b);
++ return (uint64x1_t) (__a > __b);
+}
-#define vrshrn_n_s64(a, b) \
@@ -20660,11 +22342,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_u16 (uint16x8_t __a, uint16x8_t __b)
++vcgt_s8 (int8x8_t __a, int8x8_t __b)
+{
-+ return (__a >= __b);
++ return (uint8x8_t) (__a > __b);
+}
-#define vrshrn_n_u16(a, b) \
@@ -20678,11 +22360,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_u32 (uint32x4_t __a, uint32x4_t __b)
++vcgt_s16 (int16x4_t __a, int16x4_t __b)
+{
-+ return (__a >= __b);
++ return (uint16x4_t) (__a > __b);
+}
-#define vrshrn_n_u32(a, b) \
@@ -20696,11 +22378,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_u64 (uint64x2_t __a, uint64x2_t __b)
++vcgt_s32 (int32x2_t __a, int32x2_t __b)
+{
-+ return (__a >= __b);
++ return (uint32x2_t) (__a > __b);
+}
-#define vrshrn_n_u64(a, b) \
@@ -20714,13 +22396,18 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+/* vcge - scalar. */
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcgt_s64 (int64x1_t __a, int64x1_t __b)
++{
++ return (uint64x1_t) (__a > __b);
++}
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrsqrte_f32 (float32x2_t a)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcges_f32 (float32_t __a, float32_t __b)
++vcgt_u8 (uint8x8_t __a, uint8x8_t __b)
{
- float32x2_t result;
- __asm__ ("frsqrte %0.2s,%1.2s"
@@ -20728,21 +22415,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a)
- : /* No clobbers */);
- return result;
-+ return __a >= __b ? -1 : 0;
-+}
-+
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcged_s64 (int64_t __a, int64_t __b)
-+{
-+ return __a >= __b ? -1ll : 0ll;
++ return (__a > __b);
}
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vrsqrte_f64 (float64x1_t a)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcged_u64 (uint64_t __a, uint64_t __b)
++vcgt_u16 (uint16x4_t __a, uint16x4_t __b)
{
- float64x1_t result;
- __asm__ ("frsqrte %d0,%d1"
@@ -20750,14 +22430,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a)
- : /* No clobbers */);
- return result;
-+ return __a >= __b ? -1ll : 0ll;
++ return (__a > __b);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrsqrte_u32 (uint32x2_t a)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcged_f64 (float64_t __a, float64_t __b)
++vcgt_u32 (uint32x2_t __a, uint32x2_t __b)
{
- uint32x2_t result;
- __asm__ ("ursqrte %0.2s,%1.2s"
@@ -20765,16 +22445,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a)
- : /* No clobbers */);
- return result;
-+ return __a >= __b ? -1ll : 0ll;
++ return (__a > __b);
}
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vrsqrted_f64 (float64_t a)
-+/* vcgez - vector. */
-+
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgez_f32 (float32x2_t __a)
++vcgt_u64 (uint64x1_t __a, uint64x1_t __b)
{
- float64_t result;
- __asm__ ("frsqrte %d0,%d1"
@@ -20782,21 +22460,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a)
- : /* No clobbers */);
- return result;
-+ return (uint32x2_t) (__a >= 0.0f);
-+}
-+
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgez_f64 (float64x1_t __a)
-+{
-+ return (uint64x1_t) (__a[0] >= (float64x1_t) {0.0});
++ return (__a > __b);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrsqrteq_f32 (float32x4_t a)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgez_s8 (int8x8_t __a)
++vcgtq_f32 (float32x4_t __a, float32x4_t __b)
{
- float32x4_t result;
- __asm__ ("frsqrte %0.4s,%1.4s"
@@ -20804,14 +22475,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a)
- : /* No clobbers */);
- return result;
-+ return (uint8x8_t) (__a >= 0);
++ return (uint32x4_t) (__a > __b);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vrsqrteq_f64 (float64x2_t a)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgez_s16 (int16x4_t __a)
++vcgtq_f64 (float64x2_t __a, float64x2_t __b)
{
- float64x2_t result;
- __asm__ ("frsqrte %0.2d,%1.2d"
@@ -20819,14 +22490,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a)
- : /* No clobbers */);
- return result;
-+ return (uint16x4_t) (__a >= 0);
++ return (uint64x2_t) (__a > __b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vrsqrteq_u32 (uint32x4_t a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgez_s32 (int32x2_t __a)
++vcgtq_s8 (int8x16_t __a, int8x16_t __b)
{
- uint32x4_t result;
- __asm__ ("ursqrte %0.4s,%1.4s"
@@ -20834,14 +22505,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a)
- : /* No clobbers */);
- return result;
-+ return (uint32x2_t) (__a >= 0);
++ return (uint8x16_t) (__a > __b);
}
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vrsqrtes_f32 (float32_t a)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgez_s64 (int64x1_t __a)
++vcgtq_s16 (int16x8_t __a, int16x8_t __b)
{
- float32_t result;
- __asm__ ("frsqrte %s0,%s1"
@@ -20849,14 +22520,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a)
- : /* No clobbers */);
- return result;
-+ return (uint64x1_t) (__a >= __AARCH64_INT64_C (0));
++ return (uint16x8_t) (__a > __b);
}
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrsqrts_f32 (float32x2_t a, float32x2_t b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgezq_f32 (float32x4_t __a)
++vcgtq_s32 (int32x4_t __a, int32x4_t __b)
{
- float32x2_t result;
- __asm__ ("frsqrts %0.2s,%1.2s,%2.2s"
@@ -20864,14 +22535,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint32x4_t) (__a >= 0.0f);
++ return (uint32x4_t) (__a > __b);
}
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vrsqrtsd_f64 (float64_t a, float64_t b)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgezq_f64 (float64x2_t __a)
++vcgtq_s64 (int64x2_t __a, int64x2_t __b)
{
- float64_t result;
- __asm__ ("frsqrts %d0,%d1,%d2"
@@ -20879,14 +22550,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint64x2_t) (__a >= 0.0);
++ return (uint64x2_t) (__a > __b);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrsqrtsq_f32 (float32x4_t a, float32x4_t b)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgezq_s8 (int8x16_t __a)
++vcgtq_u8 (uint8x16_t __a, uint8x16_t __b)
{
- float32x4_t result;
- __asm__ ("frsqrts %0.4s,%1.4s,%2.4s"
@@ -20894,14 +22565,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint8x16_t) (__a >= 0);
++ return (__a > __b);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vrsqrtsq_f64 (float64x2_t a, float64x2_t b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgezq_s16 (int16x8_t __a)
++vcgtq_u16 (uint16x8_t __a, uint16x8_t __b)
{
- float64x2_t result;
- __asm__ ("frsqrts %0.2d,%1.2d,%2.2d"
@@ -20909,14 +22580,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint16x8_t) (__a >= 0);
++ return (__a > __b);
}
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vrsqrtss_f32 (float32_t a, float32_t b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgezq_s32 (int32x4_t __a)
++vcgtq_u32 (uint32x4_t __a, uint32x4_t __b)
{
- float32_t result;
- __asm__ ("frsqrts %s0,%s1,%s2"
@@ -20924,7 +22595,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint32x4_t) (__a >= 0);
++ return (__a > __b);
}
-#define vshrn_high_n_s16(a, b, c) \
@@ -21016,7 +22687,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
--
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcgtq_u64 (uint64x2_t __a, uint64x2_t __b)
++{
++ return (__a > __b);
++}
+
-#define vshrn_n_s16(a, b) \
- __extension__ \
- ({ \
@@ -21028,7 +22705,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
--
++/* vcgt - scalar. */
+
-#define vshrn_n_s32(a, b) \
- __extension__ \
- ({ \
@@ -21040,7 +22718,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
--
++__extension__ extern __inline uint32_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcgts_f32 (float32_t __a, float32_t __b)
++{
++ return __a > __b ? -1 : 0;
++}
+
-#define vshrn_n_s64(a, b) \
- __extension__ \
- ({ \
@@ -21052,7 +22736,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
--
++__extension__ extern __inline uint64_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcgtd_s64 (int64_t __a, int64_t __b)
++{
++ return __a > __b ? -1ll : 0ll;
++}
+
-#define vshrn_n_u16(a, b) \
- __extension__ \
- ({ \
@@ -21064,7 +22754,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
--
++__extension__ extern __inline uint64_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcgtd_u64 (uint64_t __a, uint64_t __b)
++{
++ return __a > __b ? -1ll : 0ll;
++}
+
-#define vshrn_n_u32(a, b) \
- __extension__ \
- ({ \
@@ -21076,7 +22772,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
--
++__extension__ extern __inline uint64_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcgtd_f64 (float64_t __a, float64_t __b)
++{
++ return __a > __b ? -1ll : 0ll;
++}
+
-#define vshrn_n_u64(a, b) \
- __extension__ \
- ({ \
@@ -21088,12 +22790,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgezq_s64 (int64x2_t __a)
-+{
-+ return (uint64x2_t) (__a >= __AARCH64_INT64_C (0));
-+}
++/* vcgtz - vector. */
-#define vsli_n_p8(a, b, c) \
- __extension__ \
@@ -21107,7 +22804,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+/* vcgez - scalar. */
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcgtz_f32 (float32x2_t __a)
++{
++ return (uint32x2_t) (__a > 0.0f);
++}
-#define vsli_n_p16(a, b, c) \
- __extension__ \
@@ -21121,11 +22823,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgezs_f32 (float32_t __a)
++vcgtz_f64 (float64x1_t __a)
+{
-+ return __a >= 0.0f ? -1 : 0;
++ return (uint64x1_t) (__a > (float64x1_t) {0.0});
+}
-#define vsliq_n_p8(a, b, c) \
@@ -21140,11 +22842,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgezd_s64 (int64_t __a)
++vcgtz_s8 (int8x8_t __a)
+{
-+ return __a >= 0 ? -1ll : 0ll;
++ return (uint8x8_t) (__a > 0);
+}
-#define vsliq_n_p16(a, b, c) \
@@ -21159,11 +22861,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgezd_f64 (float64_t __a)
++vcgtz_s16 (int16x4_t __a)
+{
-+ return __a >= 0.0 ? -1ll : 0ll;
++ return (uint16x4_t) (__a > 0);
+}
-#define vsri_n_p8(a, b, c) \
@@ -21178,7 +22880,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+/* vcgt - vector. */
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcgtz_s32 (int32x2_t __a)
++{
++ return (uint32x2_t) (__a > 0);
++}
-#define vsri_n_p16(a, b, c) \
- __extension__ \
@@ -21192,11 +22899,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_f32 (float32x2_t __a, float32x2_t __b)
++vcgtz_s64 (int64x1_t __a)
+{
-+ return (uint32x2_t) (__a > __b);
++ return (uint64x1_t) (__a > __AARCH64_INT64_C (0));
+}
-#define vsriq_n_p8(a, b, c) \
@@ -21211,11 +22918,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_f64 (float64x1_t __a, float64x1_t __b)
++vcgtzq_f32 (float32x4_t __a)
+{
-+ return (uint64x1_t) (__a > __b);
++ return (uint32x4_t) (__a > 0.0f);
+}
-#define vsriq_n_p16(a, b, c) \
@@ -21230,18 +22937,18 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : /* No clobbers */); \
- result; \
- })
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_s8 (int8x8_t __a, int8x8_t __b)
++vcgtzq_f64 (float64x2_t __a)
+{
-+ return (uint8x8_t) (__a > __b);
++ return (uint64x2_t) (__a > 0.0);
+}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtst_p8 (poly8x8_t a, poly8x8_t b)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_s16 (int16x4_t __a, int16x4_t __b)
++vcgtzq_s8 (int8x16_t __a)
{
- uint8x8_t result;
- __asm__ ("cmtst %0.8b, %1.8b, %2.8b"
@@ -21249,14 +22956,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint16x4_t) (__a > __b);
++ return (uint8x16_t) (__a > 0);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vtst_p16 (poly16x4_t a, poly16x4_t b)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_s32 (int32x2_t __a, int32x2_t __b)
++vcgtzq_s16 (int16x8_t __a)
{
- uint16x4_t result;
- __asm__ ("cmtst %0.4h, %1.4h, %2.4h"
@@ -21264,14 +22971,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint32x2_t) (__a > __b);
++ return (uint16x8_t) (__a > 0);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vtstq_p8 (poly8x16_t a, poly8x16_t b)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_s64 (int64x1_t __a, int64x1_t __b)
++vcgtzq_s32 (int32x4_t __a)
{
- uint8x16_t result;
- __asm__ ("cmtst %0.16b, %1.16b, %2.16b"
@@ -21279,14 +22986,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint64x1_t) (__a > __b);
++ return (uint32x4_t) (__a > 0);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vtstq_p16 (poly16x8_t a, poly16x8_t b)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_u8 (uint8x8_t __a, uint8x8_t __b)
++vcgtzq_s64 (int64x2_t __a)
{
- uint16x8_t result;
- __asm__ ("cmtst %0.8h, %1.8h, %2.8h"
@@ -21294,39 +23001,34 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (__a > __b);
++ return (uint64x2_t) (__a > __AARCH64_INT64_C (0));
}
-/* End of temporary inline asm implementations. */
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_u16 (uint16x4_t __a, uint16x4_t __b)
-+{
-+ return (__a > __b);
-+}
++/* vcgtz - scalar. */
-/* Start of temporary inline asm for vldn, vstn and friends. */
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_u32 (uint32x2_t __a, uint32x2_t __b)
++vcgtzs_f32 (float32_t __a)
+{
-+ return (__a > __b);
++ return __a > 0.0f ? -1 : 0;
+}
-/* Create struct element types for duplicating loads.
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_u64 (uint64x1_t __a, uint64x1_t __b)
++vcgtzd_s64 (int64_t __a)
+{
-+ return (__a > __b);
++ return __a > 0 ? -1ll : 0ll;
+}
- Create 2 element structures of:
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_f32 (float32x4_t __a, float32x4_t __b)
++vcgtzd_f64 (float64_t __a)
+{
-+ return (uint32x4_t) (__a > __b);
++ return __a > 0.0 ? -1ll : 0ll;
+}
- +------+----+----+----+----+
@@ -21340,19 +23042,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- +------+----+----+----+----+
- |poly | Y | Y | - | - |
- +------+----+----+----+----+
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_f64 (float64x2_t __a, float64x2_t __b)
-+{
-+ return (uint64x2_t) (__a > __b);
-+}
++/* vcle - vector. */
- Create 3 element structures of:
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_s8 (int8x16_t __a, int8x16_t __b)
++vcle_f32 (float32x2_t __a, float32x2_t __b)
+{
-+ return (uint8x16_t) (__a > __b);
++ return (uint32x2_t) (__a <= __b);
+}
- +------+----+----+----+----+
@@ -21366,19 +23063,19 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- +------+----+----+----+----+
- |poly | Y | Y | - | - |
- +------+----+----+----+----+
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_s16 (int16x8_t __a, int16x8_t __b)
++vcle_f64 (float64x1_t __a, float64x1_t __b)
+{
-+ return (uint16x8_t) (__a > __b);
++ return (uint64x1_t) (__a <= __b);
+}
- Create 4 element structures of:
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_s32 (int32x4_t __a, int32x4_t __b)
++vcle_s8 (int8x8_t __a, int8x8_t __b)
+{
-+ return (uint32x4_t) (__a > __b);
++ return (uint8x8_t) (__a <= __b);
+}
- +------+----+----+----+----+
@@ -21392,11 +23089,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- +------+----+----+----+----+
- |poly | Y | N | - | - |
- +------+----+----+----+----+
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_s64 (int64x2_t __a, int64x2_t __b)
++vcle_s16 (int16x4_t __a, int16x4_t __b)
+{
-+ return (uint64x2_t) (__a > __b);
++ return (uint16x4_t) (__a <= __b);
+}
- This is required for casting memory reference. */
@@ -21404,11 +23101,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- typedef struct t ## sz ## x ## nelem ## _t { \
- t ## sz ## _t val[nelem]; \
- } t ## sz ## x ## nelem ## _t;
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_u8 (uint8x16_t __a, uint8x16_t __b)
++vcle_s32 (int32x2_t __a, int32x2_t __b)
+{
-+ return (__a > __b);
++ return (uint32x2_t) (__a <= __b);
+}
-/* 2-element structs. */
@@ -21441,29 +23138,20 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__STRUCTN (poly, 8, 4)
-__STRUCTN (float, 64, 4)
-#undef __STRUCTN
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_u16 (uint16x8_t __a, uint16x8_t __b)
-+{
-+ return (__a > __b);
-+}
-+
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_u32 (uint32x4_t __a, uint32x4_t __b)
++vcle_s64 (int64x1_t __a, int64x1_t __b)
+{
-+ return (__a > __b);
++ return (uint64x1_t) (__a <= __b);
+}
-+
-+__extension__ extern __inline uint64x2_t
+
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_u64 (uint64x2_t __a, uint64x2_t __b)
++vcle_u8 (uint8x8_t __a, uint8x8_t __b)
+{
-+ return (__a > __b);
++ return (__a <= __b);
+}
-+/* vcgt - scalar. */
-
-#define __ST2_LANE_FUNC(intype, largetype, ptrtype, mode, \
- qmode, ptr_mode, funcsuffix, signedtype) \
-__extension__ static __inline void \
@@ -21485,11 +23173,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- (signedtype) __temp.val[1], 1); \
- __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
- __ptr, __o, __c); \
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgts_f32 (float32_t __a, float32_t __b)
++vcle_u16 (uint16x4_t __a, uint16x4_t __b)
+{
-+ return __a > __b ? -1 : 0;
++ return (__a <= __b);
}
-__ST2_LANE_FUNC (float16x4x2_t, float16x8x2_t, float16_t, v4hf, v8hf, hf, f16,
@@ -21518,11 +23206,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- int32x4_t)
-__ST2_LANE_FUNC (uint64x1x2_t, uint64x2x2_t, uint64_t, di, v2di, di, u64,
- int64x2_t)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtd_s64 (int64_t __a, int64_t __b)
++vcle_u32 (uint32x2_t __a, uint32x2_t __b)
+{
-+ return __a > __b ? -1ll : 0ll;
++ return (__a <= __b);
+}
-#undef __ST2_LANE_FUNC
@@ -21536,11 +23224,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __builtin_aarch64_simd_oi __o; } __temp = { __b }; \
- __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
- __ptr, __temp.__o, __c); \
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtd_u64 (uint64_t __a, uint64_t __b)
++vcle_u64 (uint64x1_t __a, uint64x1_t __b)
+{
-+ return __a > __b ? -1ll : 0ll;
++ return (__a <= __b);
}
-__ST2_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16)
@@ -21556,11 +23244,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__ST2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
-__ST2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
-__ST2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtd_f64 (float64_t __a, float64_t __b)
++vcleq_f32 (float32x4_t __a, float32x4_t __b)
+{
-+ return __a > __b ? -1ll : 0ll;
++ return (uint32x4_t) (__a <= __b);
+}
-#define __ST3_LANE_FUNC(intype, largetype, ptrtype, mode, \
@@ -21589,13 +23277,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- (signedtype) __temp.val[2], 2); \
- __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
- __ptr, __o, __c); \
-+/* vcgtz - vector. */
-+
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtz_f32 (float32x2_t __a)
++vcleq_f64 (float64x2_t __a, float64x2_t __b)
+{
-+ return (uint32x2_t) (__a > 0.0f);
++ return (uint64x2_t) (__a <= __b);
}
-__ST3_LANE_FUNC (float16x4x3_t, float16x8x3_t, float16_t, v4hf, v8hf, hf, f16,
@@ -21624,11 +23310,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- int32x4_t)
-__ST3_LANE_FUNC (uint64x1x3_t, uint64x2x3_t, uint64_t, di, v2di, di, u64,
- int64x2_t)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtz_f64 (float64x1_t __a)
++vcleq_s8 (int8x16_t __a, int8x16_t __b)
+{
-+ return (uint64x1_t) (__a > (float64x1_t) {0.0});
++ return (uint8x16_t) (__a <= __b);
+}
-#undef __ST3_LANE_FUNC
@@ -21642,11 +23328,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __builtin_aarch64_simd_ci __o; } __temp = { __b }; \
- __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
- __ptr, __temp.__o, __c); \
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtz_s8 (int8x8_t __a)
++vcleq_s16 (int16x8_t __a, int16x8_t __b)
+{
-+ return (uint8x8_t) (__a > 0);
++ return (uint16x8_t) (__a <= __b);
}
-__ST3_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16)
@@ -21662,11 +23348,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16)
-__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32)
-__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtz_s16 (int16x4_t __a)
++vcleq_s32 (int32x4_t __a, int32x4_t __b)
+{
-+ return (uint16x4_t) (__a > 0);
++ return (uint32x4_t) (__a <= __b);
+}
-#define __ST4_LANE_FUNC(intype, largetype, ptrtype, mode, \
@@ -21700,11 +23386,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- (signedtype) __temp.val[3], 3); \
- __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
- __ptr, __o, __c); \
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtz_s32 (int32x2_t __a)
++vcleq_s64 (int64x2_t __a, int64x2_t __b)
+{
-+ return (uint32x2_t) (__a > 0);
++ return (uint64x2_t) (__a <= __b);
}
-__ST4_LANE_FUNC (float16x4x4_t, float16x8x4_t, float16_t, v4hf, v8hf, hf, f16,
@@ -21733,46 +23419,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- int32x4_t)
-__ST4_LANE_FUNC (uint64x1x4_t, uint64x2x4_t, uint64_t, di, v2di, di, u64,
- int64x2_t)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtz_s64 (int64x1_t __a)
-+{
-+ return (uint64x1_t) (__a > __AARCH64_INT64_C (0));
-+}
-+
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtzq_f32 (float32x4_t __a)
-+{
-+ return (uint32x4_t) (__a > 0.0f);
-+}
-+
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtzq_f64 (float64x2_t __a)
-+{
-+ return (uint64x2_t) (__a > 0.0);
-+}
-+
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtzq_s8 (int8x16_t __a)
-+{
-+ return (uint8x16_t) (__a > 0);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtzq_s16 (int16x8_t __a)
-+{
-+ return (uint16x8_t) (__a > 0);
-+}
-+
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtzq_s32 (int32x4_t __a)
++vcleq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
-+ return (uint32x4_t) (__a > 0);
++ return (__a <= __b);
+}
-#undef __ST4_LANE_FUNC
@@ -21786,11 +23437,25 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __builtin_aarch64_simd_xi __o; } __temp = { __b }; \
- __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
- __ptr, __temp.__o, __c); \
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcleq_u16 (uint16x8_t __a, uint16x8_t __b)
++{
++ return (__a <= __b);
++}
++
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcleq_u32 (uint32x4_t __a, uint32x4_t __b)
++{
++ return (__a <= __b);
++}
++
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtzq_s64 (int64x2_t __a)
++vcleq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
-+ return (uint64x2_t) (__a > __AARCH64_INT64_C (0));
++ return (__a <= __b);
}
-__ST4_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16)
@@ -21806,127 +23471,121 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16)
-__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32)
-__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64)
-+/* vcgtz - scalar. */
++/* vcle - scalar. */
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vaddlv_s32 (int32x2_t a)
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtzs_f32 (float32_t __a)
++vcles_f32 (float32_t __a, float32_t __b)
{
- int64_t result;
- __asm__ ("saddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : );
- return result;
-+ return __a > 0.0f ? -1 : 0;
++ return __a <= __b ? -1 : 0;
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vaddlv_u32 (uint32x2_t a)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtzd_s64 (int64_t __a)
++vcled_s64 (int64_t __a, int64_t __b)
{
- uint64_t result;
- __asm__ ("uaddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : );
- return result;
-+ return __a > 0 ? -1ll : 0ll;
++ return __a <= __b ? -1ll : 0ll;
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqdmulh_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtzd_f64 (float64_t __a)
++vcled_u64 (uint64_t __a, uint64_t __b)
{
- return __builtin_aarch64_sqdmulh_laneqv4hi (__a, __b, __c);
-+ return __a > 0.0 ? -1ll : 0ll;
++ return __a <= __b ? -1ll : 0ll;
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c)
-+/* vcle - vector. */
-+
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_f32 (float32x2_t __a, float32x2_t __b)
++vcled_f64 (float64_t __a, float64_t __b)
{
- return __builtin_aarch64_sqdmulh_laneqv2si (__a, __b, __c);
-+ return (uint32x2_t) (__a <= __b);
++ return __a <= __b ? -1ll : 0ll;
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
-+__extension__ extern __inline uint64x1_t
++/* vclez - vector. */
++
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_f64 (float64x1_t __a, float64x1_t __b)
++vclez_f32 (float32x2_t __a)
{
- return __builtin_aarch64_sqdmulh_laneqv8hi (__a, __b, __c);
-+ return (uint64x1_t) (__a <= __b);
++ return (uint32x2_t) (__a <= 0.0f);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_s8 (int8x8_t __a, int8x8_t __b)
++vclez_f64 (float64x1_t __a)
{
- return __builtin_aarch64_sqdmulh_laneqv4si (__a, __b, __c);
-+ return (uint8x8_t) (__a <= __b);
++ return (uint64x1_t) (__a <= (float64x1_t) {0.0});
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqrdmulh_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_s16 (int16x4_t __a, int16x4_t __b)
++vclez_s8 (int8x8_t __a)
{
- return __builtin_aarch64_sqrdmulh_laneqv4hi (__a, __b, __c);
-+ return (uint16x4_t) (__a <= __b);
++ return (uint8x8_t) (__a <= 0);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqrdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_s32 (int32x2_t __a, int32x2_t __b)
++vclez_s16 (int16x4_t __a)
{
- return __builtin_aarch64_sqrdmulh_laneqv2si (__a, __b, __c);
-+ return (uint32x2_t) (__a <= __b);
++ return (uint16x4_t) (__a <= 0);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqrdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_s64 (int64x1_t __a, int64x1_t __b)
++vclez_s32 (int32x2_t __a)
{
- return __builtin_aarch64_sqrdmulh_laneqv8hi (__a, __b, __c);
-+ return (uint64x1_t) (__a <= __b);
++ return (uint32x2_t) (__a <= 0);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqrdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_u8 (uint8x8_t __a, uint8x8_t __b)
++vclez_s64 (int64x1_t __a)
{
- return __builtin_aarch64_sqrdmulh_laneqv4si (__a, __b, __c);
-+ return (__a <= __b);
++ return (uint64x1_t) (__a <= __AARCH64_INT64_C (0));
}
-/* Table intrinsics. */
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_u16 (uint16x4_t __a, uint16x4_t __b)
-+{
-+ return (__a <= __b);
-+}
-
+-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vqtbl1_p8 (poly8x16_t a, uint8x8_t b)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_u32 (uint32x2_t __a, uint32x2_t __b)
++vclezq_f32 (float32x4_t __a)
{
- poly8x8_t result;
- __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
@@ -21934,14 +23593,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (__a <= __b);
++ return (uint32x4_t) (__a <= 0.0f);
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqtbl1_s8 (int8x16_t a, uint8x8_t b)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_u64 (uint64x1_t __a, uint64x1_t __b)
++vclezq_f64 (float64x2_t __a)
{
- int8x8_t result;
- __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
@@ -21949,14 +23608,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (__a <= __b);
++ return (uint64x2_t) (__a <= 0.0);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqtbl1_u8 (uint8x16_t a, uint8x8_t b)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_f32 (float32x4_t __a, float32x4_t __b)
++vclezq_s8 (int8x16_t __a)
{
- uint8x8_t result;
- __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
@@ -21964,14 +23623,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint32x4_t) (__a <= __b);
++ return (uint8x16_t) (__a <= 0);
}
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vqtbl1q_p8 (poly8x16_t a, uint8x16_t b)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_f64 (float64x2_t __a, float64x2_t __b)
++vclezq_s16 (int16x8_t __a)
{
- poly8x16_t result;
- __asm__ ("tbl %0.16b, {%1.16b}, %2.16b"
@@ -21979,14 +23638,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint64x2_t) (__a <= __b);
++ return (uint16x8_t) (__a <= 0);
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqtbl1q_s8 (int8x16_t a, uint8x16_t b)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_s8 (int8x16_t __a, int8x16_t __b)
++vclezq_s32 (int32x4_t __a)
{
- int8x16_t result;
- __asm__ ("tbl %0.16b, {%1.16b}, %2.16b"
@@ -21994,14 +23653,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint8x16_t) (__a <= __b);
++ return (uint32x4_t) (__a <= 0);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqtbl1q_u8 (uint8x16_t a, uint8x16_t b)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_s16 (int16x8_t __a, int16x8_t __b)
++vclezq_s64 (int64x2_t __a)
{
- uint8x16_t result;
- __asm__ ("tbl %0.16b, {%1.16b}, %2.16b"
@@ -22009,14 +23668,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-+ return (uint16x8_t) (__a <= __b);
++ return (uint64x2_t) (__a <= __AARCH64_INT64_C (0));
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqtbx1_s8 (int8x8_t r, int8x16_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint32x4_t
++/* vclez - scalar. */
++
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_s32 (int32x4_t __a, int32x4_t __b)
++vclezs_f32 (float32_t __a)
{
- int8x8_t result = r;
- __asm__ ("tbx %0.8b,{%1.16b},%2.8b"
@@ -22024,14 +23685,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(tab), "w"(idx)
- : /* No clobbers */);
- return result;
-+ return (uint32x4_t) (__a <= __b);
++ return __a <= 0.0f ? -1 : 0;
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqtbx1_u8 (uint8x8_t r, uint8x16_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_s64 (int64x2_t __a, int64x2_t __b)
++vclezd_s64 (int64_t __a)
{
- uint8x8_t result = r;
- __asm__ ("tbx %0.8b,{%1.16b},%2.8b"
@@ -22039,14 +23700,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(tab), "w"(idx)
- : /* No clobbers */);
- return result;
-+ return (uint64x2_t) (__a <= __b);
++ return __a <= 0 ? -1ll : 0ll;
}
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vqtbx1_p8 (poly8x8_t r, poly8x16_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_u8 (uint8x16_t __a, uint8x16_t __b)
++vclezd_f64 (float64_t __a)
{
- poly8x8_t result = r;
- __asm__ ("tbx %0.8b,{%1.16b},%2.8b"
@@ -22054,14 +23715,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(tab), "w"(idx)
- : /* No clobbers */);
- return result;
-+ return (__a <= __b);
++ return __a <= 0.0 ? -1ll : 0ll;
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqtbx1q_s8 (int8x16_t r, int8x16_t tab, uint8x16_t idx)
-+__extension__ extern __inline uint16x8_t
++/* vclt - vector. */
++
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_u16 (uint16x8_t __a, uint16x8_t __b)
++vclt_f32 (float32x2_t __a, float32x2_t __b)
{
- int8x16_t result = r;
- __asm__ ("tbx %0.16b,{%1.16b},%2.16b"
@@ -22069,14 +23732,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(tab), "w"(idx)
- : /* No clobbers */);
- return result;
-+ return (__a <= __b);
++ return (uint32x2_t) (__a < __b);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqtbx1q_u8 (uint8x16_t r, uint8x16_t tab, uint8x16_t idx)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_u32 (uint32x4_t __a, uint32x4_t __b)
++vclt_f64 (float64x1_t __a, float64x1_t __b)
{
- uint8x16_t result = r;
- __asm__ ("tbx %0.16b,{%1.16b},%2.16b"
@@ -22084,14 +23747,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(tab), "w"(idx)
- : /* No clobbers */);
- return result;
-+ return (__a <= __b);
++ return (uint64x1_t) (__a < __b);
}
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vqtbx1q_p8 (poly8x16_t r, poly8x16_t tab, uint8x16_t idx)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_u64 (uint64x2_t __a, uint64x2_t __b)
++vclt_s8 (int8x8_t __a, int8x8_t __b)
{
- poly8x16_t result = r;
- __asm__ ("tbx %0.16b,{%1.16b},%2.16b"
@@ -22099,17 +23762,22 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(tab), "w"(idx)
- : /* No clobbers */);
- return result;
-+ return (__a <= __b);
++ return (uint8x8_t) (__a < __b);
}
-/* V7 legacy table intrinsics. */
-+/* vcle - scalar. */
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vclt_s16 (int16x4_t __a, int16x4_t __b)
++{
++ return (uint16x4_t) (__a < __b);
++}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbl1_s8 (int8x8_t tab, int8x8_t idx)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcles_f32 (float32_t __a, float32_t __b)
++vclt_s32 (int32x2_t __a, int32x2_t __b)
{
- int8x8_t result;
- int8x16_t temp = vcombine_s8 (tab, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
@@ -22118,14 +23786,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(temp), "w"(idx)
- : /* No clobbers */);
- return result;
-+ return __a <= __b ? -1 : 0;
++ return (uint32x2_t) (__a < __b);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbl1_u8 (uint8x8_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcled_s64 (int64_t __a, int64_t __b)
++vclt_s64 (int64x1_t __a, int64x1_t __b)
{
- uint8x8_t result;
- uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
@@ -22134,14 +23802,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(temp), "w"(idx)
- : /* No clobbers */);
- return result;
-+ return __a <= __b ? -1ll : 0ll;
++ return (uint64x1_t) (__a < __b);
}
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbl1_p8 (poly8x8_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcled_u64 (uint64_t __a, uint64_t __b)
++vclt_u8 (uint8x8_t __a, uint8x8_t __b)
{
- poly8x8_t result;
- poly8x16_t temp = vcombine_p8 (tab, vcreate_p8 (__AARCH64_UINT64_C (0x0)));
@@ -22150,14 +23818,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(temp), "w"(idx)
- : /* No clobbers */);
- return result;
-+ return __a <= __b ? -1ll : 0ll;
++ return (__a < __b);
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbl2_s8 (int8x8x2_t tab, int8x8_t idx)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcled_f64 (float64_t __a, float64_t __b)
++vclt_u16 (uint16x4_t __a, uint16x4_t __b)
{
- int8x8_t result;
- int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]);
@@ -22166,16 +23834,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(temp), "w"(idx)
- : /* No clobbers */);
- return result;
-+ return __a <= __b ? -1ll : 0ll;
++ return (__a < __b);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbl2_u8 (uint8x8x2_t tab, uint8x8_t idx)
-+/* vclez - vector. */
-+
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclez_f32 (float32x2_t __a)
++vclt_u32 (uint32x2_t __a, uint32x2_t __b)
{
- uint8x8_t result;
- uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]);
@@ -22184,14 +23850,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(temp), "w"(idx)
- : /* No clobbers */);
- return result;
-+ return (uint32x2_t) (__a <= 0.0f);
++ return (__a < __b);
}
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbl2_p8 (poly8x8x2_t tab, uint8x8_t idx)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclez_f64 (float64x1_t __a)
++vclt_u64 (uint64x1_t __a, uint64x1_t __b)
{
- poly8x8_t result;
- poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]);
@@ -22200,21 +23866,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(temp), "w"(idx)
- : /* No clobbers */);
- return result;
-+ return (uint64x1_t) (__a <= (float64x1_t) {0.0});
-+}
-+
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclez_s8 (int8x8_t __a)
-+{
-+ return (uint8x8_t) (__a <= 0);
++ return (__a < __b);
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbl3_s8 (int8x8x3_t tab, int8x8_t idx)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclez_s16 (int16x4_t __a)
++vcltq_f32 (float32x4_t __a, float32x4_t __b)
{
- int8x8_t result;
- int8x16x2_t temp;
@@ -22227,14 +23886,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- (int8x16_t) temp.val[1], 1);
- result = __builtin_aarch64_tbl3v8qi (__o, idx);
- return result;
-+ return (uint16x4_t) (__a <= 0);
++ return (uint32x4_t) (__a < __b);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclez_s32 (int32x2_t __a)
++vcltq_f64 (float64x2_t __a, float64x2_t __b)
{
- uint8x8_t result;
- uint8x16x2_t temp;
@@ -22247,14 +23906,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- (int8x16_t) temp.val[1], 1);
- result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
- return result;
-+ return (uint32x2_t) (__a <= 0);
++ return (uint64x2_t) (__a < __b);
}
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclez_s64 (int64x1_t __a)
++vcltq_s8 (int8x16_t __a, int8x16_t __b)
{
- poly8x8_t result;
- poly8x16x2_t temp;
@@ -22267,14 +23926,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- (int8x16_t) temp.val[1], 1);
- result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
- return result;
-+ return (uint64x1_t) (__a <= __AARCH64_INT64_C (0));
++ return (uint8x16_t) (__a < __b);
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbl4_s8 (int8x8x4_t tab, int8x8_t idx)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclezq_f32 (float32x4_t __a)
++vcltq_s16 (int16x8_t __a, int16x8_t __b)
{
- int8x8_t result;
- int8x16x2_t temp;
@@ -22287,14 +23946,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- (int8x16_t) temp.val[1], 1);
- result = __builtin_aarch64_tbl3v8qi (__o, idx);
- return result;
-+ return (uint32x4_t) (__a <= 0.0f);
++ return (uint16x8_t) (__a < __b);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclezq_f64 (float64x2_t __a)
++vcltq_s32 (int32x4_t __a, int32x4_t __b)
{
- uint8x8_t result;
- uint8x16x2_t temp;
@@ -22307,14 +23966,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- (int8x16_t) temp.val[1], 1);
- result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
- return result;
-+ return (uint64x2_t) (__a <= 0.0);
++ return (uint32x4_t) (__a < __b);
}
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclezq_s8 (int8x16_t __a)
++vcltq_s64 (int64x2_t __a, int64x2_t __b)
{
- poly8x8_t result;
- poly8x16x2_t temp;
@@ -22327,14 +23986,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- (int8x16_t) temp.val[1], 1);
- result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
- return result;
-+ return (uint8x16_t) (__a <= 0);
++ return (uint64x2_t) (__a < __b);
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbx2_s8 (int8x8_t r, int8x8x2_t tab, int8x8_t idx)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclezq_s16 (int16x8_t __a)
++vcltq_u8 (uint8x16_t __a, uint8x16_t __b)
{
- int8x8_t result = r;
- int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]);
@@ -22343,14 +24002,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(temp), "w"(idx)
- : /* No clobbers */);
- return result;
-+ return (uint16x8_t) (__a <= 0);
++ return (__a < __b);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbx2_u8 (uint8x8_t r, uint8x8x2_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclezq_s32 (int32x4_t __a)
++vcltq_u16 (uint16x8_t __a, uint16x8_t __b)
{
- uint8x8_t result = r;
- uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]);
@@ -22359,14 +24018,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(temp), "w"(idx)
- : /* No clobbers */);
- return result;
-+ return (uint32x4_t) (__a <= 0);
++ return (__a < __b);
}
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclezq_s64 (int64x2_t __a)
++vcltq_u32 (uint32x4_t __a, uint32x4_t __b)
{
- poly8x8_t result = r;
- poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]);
@@ -22375,723 +24034,806 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- : "w"(temp), "w"(idx)
- : /* No clobbers */);
- return result;
-+ return (uint64x2_t) (__a <= __AARCH64_INT64_C (0));
++ return (__a < __b);
}
-/* End of temporary inline asm. */
--
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcltq_u64 (uint64x2_t __a, uint64x2_t __b)
++{
++ return (__a < __b);
++}
+
-/* Start of optimal implementations in approved order. */
--
++/* vclt - scalar. */
+
-/* vabs */
-+/* vclez - scalar. */
++__extension__ extern __inline uint32_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vclts_f32 (float32_t __a, float32_t __b)
++{
++ return __a < __b ? -1 : 0;
++}
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vabs_f32 (float32x2_t __a)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclezs_f32 (float32_t __a)
++vcltd_s64 (int64_t __a, int64_t __b)
{
- return __builtin_aarch64_absv2sf (__a);
-+ return __a <= 0.0f ? -1 : 0;
++ return __a < __b ? -1ll : 0ll;
}
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vabs_f64 (float64x1_t __a)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclezd_s64 (int64_t __a)
++vcltd_u64 (uint64_t __a, uint64_t __b)
{
- return (float64x1_t) {__builtin_fabs (__a[0])};
-+ return __a <= 0 ? -1ll : 0ll;
++ return __a < __b ? -1ll : 0ll;
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vabs_s8 (int8x8_t __a)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclezd_f64 (float64_t __a)
++vcltd_f64 (float64_t __a, float64_t __b)
{
- return __builtin_aarch64_absv8qi (__a);
-+ return __a <= 0.0 ? -1ll : 0ll;
++ return __a < __b ? -1ll : 0ll;
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vabs_s16 (int16x4_t __a)
-+/* vclt - vector. */
++/* vcltz - vector. */
+
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_f32 (float32x2_t __a, float32x2_t __b)
++vcltz_f32 (float32x2_t __a)
{
- return __builtin_aarch64_absv4hi (__a);
-+ return (uint32x2_t) (__a < __b);
++ return (uint32x2_t) (__a < 0.0f);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vabs_s32 (int32x2_t __a)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_f64 (float64x1_t __a, float64x1_t __b)
++vcltz_f64 (float64x1_t __a)
{
- return __builtin_aarch64_absv2si (__a);
-+ return (uint64x1_t) (__a < __b);
++ return (uint64x1_t) (__a < (float64x1_t) {0.0});
}
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vabs_s64 (int64x1_t __a)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_s8 (int8x8_t __a, int8x8_t __b)
++vcltz_s8 (int8x8_t __a)
{
- return (int64x1_t) {__builtin_aarch64_absdi (__a[0])};
-+ return (uint8x8_t) (__a < __b);
++ return (uint8x8_t) (__a < 0);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vabsq_f32 (float32x4_t __a)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_s16 (int16x4_t __a, int16x4_t __b)
++vcltz_s16 (int16x4_t __a)
{
- return __builtin_aarch64_absv4sf (__a);
-+ return (uint16x4_t) (__a < __b);
++ return (uint16x4_t) (__a < 0);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vabsq_f64 (float64x2_t __a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_s32 (int32x2_t __a, int32x2_t __b)
++vcltz_s32 (int32x2_t __a)
{
- return __builtin_aarch64_absv2df (__a);
-+ return (uint32x2_t) (__a < __b);
++ return (uint32x2_t) (__a < 0);
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vabsq_s8 (int8x16_t __a)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_s64 (int64x1_t __a, int64x1_t __b)
++vcltz_s64 (int64x1_t __a)
{
- return __builtin_aarch64_absv16qi (__a);
-+ return (uint64x1_t) (__a < __b);
++ return (uint64x1_t) (__a < __AARCH64_INT64_C (0));
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vabsq_s16 (int16x8_t __a)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_u8 (uint8x8_t __a, uint8x8_t __b)
++vcltzq_f32 (float32x4_t __a)
{
- return __builtin_aarch64_absv8hi (__a);
-+ return (__a < __b);
++ return (uint32x4_t) (__a < 0.0f);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vabsq_s32 (int32x4_t __a)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_u16 (uint16x4_t __a, uint16x4_t __b)
++vcltzq_f64 (float64x2_t __a)
{
- return __builtin_aarch64_absv4si (__a);
-+ return (__a < __b);
++ return (uint64x2_t) (__a < 0.0);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vabsq_s64 (int64x2_t __a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_u32 (uint32x2_t __a, uint32x2_t __b)
++vcltzq_s8 (int8x16_t __a)
{
- return __builtin_aarch64_absv2di (__a);
-+ return (__a < __b);
++ return (uint8x16_t) (__a < 0);
}
-/* vadd */
--
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcltzq_s16 (int16x8_t __a)
++{
++ return (uint16x8_t) (__a < 0);
++}
+
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vaddd_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_u64 (uint64x1_t __a, uint64x1_t __b)
++vcltzq_s32 (int32x4_t __a)
{
- return __a + __b;
-+ return (__a < __b);
++ return (uint32x4_t) (__a < 0);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vaddd_u64 (uint64_t __a, uint64_t __b)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_f32 (float32x4_t __a, float32x4_t __b)
++vcltzq_s64 (int64x2_t __a)
{
- return __a + __b;
-+ return (uint32x4_t) (__a < __b);
++ return (uint64x2_t) (__a < __AARCH64_INT64_C (0));
}
-/* vaddv */
--
++/* vcltz - scalar. */
+
-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
-vaddv_s8 (int8x8_t __a)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_f64 (float64x2_t __a, float64x2_t __b)
++vcltzs_f32 (float32_t __a)
{
- return __builtin_aarch64_reduc_plus_scal_v8qi (__a);
-+ return (uint64x2_t) (__a < __b);
++ return __a < 0.0f ? -1 : 0;
}
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vaddv_s16 (int16x4_t __a)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_s8 (int8x16_t __a, int8x16_t __b)
++vcltzd_s64 (int64_t __a)
{
- return __builtin_aarch64_reduc_plus_scal_v4hi (__a);
-+ return (uint8x16_t) (__a < __b);
++ return __a < 0 ? -1ll : 0ll;
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vaddv_s32 (int32x2_t __a)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_s16 (int16x8_t __a, int16x8_t __b)
++vcltzd_f64 (float64_t __a)
{
- return __builtin_aarch64_reduc_plus_scal_v2si (__a);
-+ return (uint16x8_t) (__a < __b);
++ return __a < 0.0 ? -1ll : 0ll;
}
-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
-vaddv_u8 (uint8x8_t __a)
-+__extension__ extern __inline uint32x4_t
++/* vcls. */
++
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_s32 (int32x4_t __a, int32x4_t __b)
++vcls_s8 (int8x8_t __a)
{
- return (uint8_t) __builtin_aarch64_reduc_plus_scal_v8qi ((int8x8_t) __a);
-+ return (uint32x4_t) (__a < __b);
++ return __builtin_aarch64_clrsbv8qi (__a);
}
-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-vaddv_u16 (uint16x4_t __a)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_s64 (int64x2_t __a, int64x2_t __b)
++vcls_s16 (int16x4_t __a)
{
- return (uint16_t) __builtin_aarch64_reduc_plus_scal_v4hi ((int16x4_t) __a);
-+ return (uint64x2_t) (__a < __b);
++ return __builtin_aarch64_clrsbv4hi (__a);
}
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vaddv_u32 (uint32x2_t __a)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_u8 (uint8x16_t __a, uint8x16_t __b)
++vcls_s32 (int32x2_t __a)
{
- return (int32_t) __builtin_aarch64_reduc_plus_scal_v2si ((int32x2_t) __a);
-+ return (__a < __b);
++ return __builtin_aarch64_clrsbv2si (__a);
}
-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
-vaddvq_s8 (int8x16_t __a)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_u16 (uint16x8_t __a, uint16x8_t __b)
++vclsq_s8 (int8x16_t __a)
{
- return __builtin_aarch64_reduc_plus_scal_v16qi (__a);
-+ return (__a < __b);
++ return __builtin_aarch64_clrsbv16qi (__a);
}
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vaddvq_s16 (int16x8_t __a)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_u32 (uint32x4_t __a, uint32x4_t __b)
++vclsq_s16 (int16x8_t __a)
{
- return __builtin_aarch64_reduc_plus_scal_v8hi (__a);
-+ return (__a < __b);
++ return __builtin_aarch64_clrsbv8hi (__a);
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vaddvq_s32 (int32x4_t __a)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_u64 (uint64x2_t __a, uint64x2_t __b)
++vclsq_s32 (int32x4_t __a)
{
- return __builtin_aarch64_reduc_plus_scal_v4si (__a);
-+ return (__a < __b);
++ return __builtin_aarch64_clrsbv4si (__a);
}
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vaddvq_s64 (int64x2_t __a)
-+/* vclt - scalar. */
++/* vclz. */
+
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclts_f32 (float32_t __a, float32_t __b)
++vclz_s8 (int8x8_t __a)
{
- return __builtin_aarch64_reduc_plus_scal_v2di (__a);
-+ return __a < __b ? -1 : 0;
++ return __builtin_aarch64_clzv8qi (__a);
}
-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
-vaddvq_u8 (uint8x16_t __a)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltd_s64 (int64_t __a, int64_t __b)
++vclz_s16 (int16x4_t __a)
{
- return (uint8_t) __builtin_aarch64_reduc_plus_scal_v16qi ((int8x16_t) __a);
-+ return __a < __b ? -1ll : 0ll;
++ return __builtin_aarch64_clzv4hi (__a);
}
-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-vaddvq_u16 (uint16x8_t __a)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltd_u64 (uint64_t __a, uint64_t __b)
++vclz_s32 (int32x2_t __a)
{
- return (uint16_t) __builtin_aarch64_reduc_plus_scal_v8hi ((int16x8_t) __a);
-+ return __a < __b ? -1ll : 0ll;
++ return __builtin_aarch64_clzv2si (__a);
}
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vaddvq_u32 (uint32x4_t __a)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltd_f64 (float64_t __a, float64_t __b)
++vclz_u8 (uint8x8_t __a)
{
- return (uint32_t) __builtin_aarch64_reduc_plus_scal_v4si ((int32x4_t) __a);
-+ return __a < __b ? -1ll : 0ll;
++ return (uint8x8_t)__builtin_aarch64_clzv8qi ((int8x8_t)__a);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vaddvq_u64 (uint64x2_t __a)
-+/* vcltz - vector. */
-+
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltz_f32 (float32x2_t __a)
++vclz_u16 (uint16x4_t __a)
{
- return (uint64_t) __builtin_aarch64_reduc_plus_scal_v2di ((int64x2_t) __a);
-+ return (uint32x2_t) (__a < 0.0f);
++ return (uint16x4_t)__builtin_aarch64_clzv4hi ((int16x4_t)__a);
}
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vaddv_f32 (float32x2_t __a)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltz_f64 (float64x1_t __a)
++vclz_u32 (uint32x2_t __a)
{
- return __builtin_aarch64_reduc_plus_scal_v2sf (__a);
-+ return (uint64x1_t) (__a < (float64x1_t) {0.0});
++ return (uint32x2_t)__builtin_aarch64_clzv2si ((int32x2_t)__a);
}
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vaddvq_f32 (float32x4_t __a)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltz_s8 (int8x8_t __a)
++vclzq_s8 (int8x16_t __a)
{
- return __builtin_aarch64_reduc_plus_scal_v4sf (__a);
-+ return (uint8x8_t) (__a < 0);
++ return __builtin_aarch64_clzv16qi (__a);
}
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vaddvq_f64 (float64x2_t __a)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltz_s16 (int16x4_t __a)
++vclzq_s16 (int16x8_t __a)
{
- return __builtin_aarch64_reduc_plus_scal_v2df (__a);
-+ return (uint16x4_t) (__a < 0);
++ return __builtin_aarch64_clzv8hi (__a);
}
-/* vbsl */
--
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vclzq_s32 (int32x4_t __a)
++{
++ return __builtin_aarch64_clzv4si (__a);
++}
+
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vbsl_f32 (uint32x2_t __a, float32x2_t __b, float32x2_t __c)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltz_s32 (int32x2_t __a)
++vclzq_u8 (uint8x16_t __a)
{
- return __builtin_aarch64_simd_bslv2sf_suss (__a, __b, __c);
-+ return (uint32x2_t) (__a < 0);
++ return (uint8x16_t)__builtin_aarch64_clzv16qi ((int8x16_t)__a);
}
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vbsl_f64 (uint64x1_t __a, float64x1_t __b, float64x1_t __c)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltz_s64 (int64x1_t __a)
++vclzq_u16 (uint16x8_t __a)
{
- return (float64x1_t)
- { __builtin_aarch64_simd_bsldf_suss (__a[0], __b[0], __c[0]) };
-+ return (uint64x1_t) (__a < __AARCH64_INT64_C (0));
++ return (uint16x8_t)__builtin_aarch64_clzv8hi ((int16x8_t)__a);
}
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vbsl_p8 (uint8x8_t __a, poly8x8_t __b, poly8x8_t __c)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltzq_f32 (float32x4_t __a)
++vclzq_u32 (uint32x4_t __a)
{
- return __builtin_aarch64_simd_bslv8qi_pupp (__a, __b, __c);
-+ return (uint32x4_t) (__a < 0.0f);
++ return (uint32x4_t)__builtin_aarch64_clzv4si ((int32x4_t)__a);
}
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vbsl_p16 (uint16x4_t __a, poly16x4_t __b, poly16x4_t __c)
-+__extension__ extern __inline uint64x2_t
++/* vcnt. */
++
++__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltzq_f64 (float64x2_t __a)
++vcnt_p8 (poly8x8_t __a)
{
- return __builtin_aarch64_simd_bslv4hi_pupp (__a, __b, __c);
-+ return (uint64x2_t) (__a < 0.0);
++ return (poly8x8_t) __builtin_aarch64_popcountv8qi ((int8x8_t) __a);
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vbsl_s8 (uint8x8_t __a, int8x8_t __b, int8x8_t __c)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltzq_s8 (int8x16_t __a)
++vcnt_s8 (int8x8_t __a)
{
- return __builtin_aarch64_simd_bslv8qi_suss (__a, __b, __c);
-+ return (uint8x16_t) (__a < 0);
++ return __builtin_aarch64_popcountv8qi (__a);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vbsl_s16 (uint16x4_t __a, int16x4_t __b, int16x4_t __c)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltzq_s16 (int16x8_t __a)
++vcnt_u8 (uint8x8_t __a)
{
- return __builtin_aarch64_simd_bslv4hi_suss (__a, __b, __c);
-+ return (uint16x8_t) (__a < 0);
++ return (uint8x8_t) __builtin_aarch64_popcountv8qi ((int8x8_t) __a);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vbsl_s32 (uint32x2_t __a, int32x2_t __b, int32x2_t __c)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltzq_s32 (int32x4_t __a)
++vcntq_p8 (poly8x16_t __a)
{
- return __builtin_aarch64_simd_bslv2si_suss (__a, __b, __c);
-+ return (uint32x4_t) (__a < 0);
++ return (poly8x16_t) __builtin_aarch64_popcountv16qi ((int8x16_t) __a);
}
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vbsl_s64 (uint64x1_t __a, int64x1_t __b, int64x1_t __c)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltzq_s64 (int64x2_t __a)
++vcntq_s8 (int8x16_t __a)
{
- return (int64x1_t)
- {__builtin_aarch64_simd_bsldi_suss (__a[0], __b[0], __c[0])};
-+ return (uint64x2_t) (__a < __AARCH64_INT64_C (0));
++ return __builtin_aarch64_popcountv16qi (__a);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vbsl_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
-+/* vcltz - scalar. */
-+
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltzs_f32 (float32_t __a)
++vcntq_u8 (uint8x16_t __a)
{
- return __builtin_aarch64_simd_bslv8qi_uuuu (__a, __b, __c);
-+ return __a < 0.0f ? -1 : 0;
++ return (uint8x16_t) __builtin_aarch64_popcountv16qi ((int8x16_t) __a);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vbsl_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
-+__extension__ extern __inline uint64_t
++/* vcopy_lane. */
++
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltzd_s64 (int64_t __a)
++vcopy_lane_f32 (float32x2_t __a, const int __lane1,
++ float32x2_t __b, const int __lane2)
{
- return __builtin_aarch64_simd_bslv4hi_uuuu (__a, __b, __c);
-+ return __a < 0 ? -1ll : 0ll;
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vbsl_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltzd_f64 (float64_t __a)
++vcopy_lane_f64 (float64x1_t __a, const int __lane1,
++ float64x1_t __b, const int __lane2)
{
- return __builtin_aarch64_simd_bslv2si_uuuu (__a, __b, __c);
-+ return __a < 0.0 ? -1ll : 0ll;
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vbsl_u64 (uint64x1_t __a, uint64x1_t __b, uint64x1_t __c)
-+/* vcls. */
-+
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcls_s8 (int8x8_t __a)
++vcopy_lane_p8 (poly8x8_t __a, const int __lane1,
++ poly8x8_t __b, const int __lane2)
{
- return (uint64x1_t)
- {__builtin_aarch64_simd_bsldi_uuuu (__a[0], __b[0], __c[0])};
-+ return __builtin_aarch64_clrsbv8qi (__a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vbslq_f32 (uint32x4_t __a, float32x4_t __b, float32x4_t __c)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcls_s16 (int16x4_t __a)
++vcopy_lane_p16 (poly16x4_t __a, const int __lane1,
++ poly16x4_t __b, const int __lane2)
{
- return __builtin_aarch64_simd_bslv4sf_suss (__a, __b, __c);
-+ return __builtin_aarch64_clrsbv4hi (__a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vbslq_f64 (uint64x2_t __a, float64x2_t __b, float64x2_t __c)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcls_s32 (int32x2_t __a)
++vcopy_lane_p64 (poly64x1_t __a, const int __lane1,
++ poly64x1_t __b, const int __lane2)
{
- return __builtin_aarch64_simd_bslv2df_suss (__a, __b, __c);
-+ return __builtin_aarch64_clrsbv2si (__a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vbslq_p8 (uint8x16_t __a, poly8x16_t __b, poly8x16_t __c)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclsq_s8 (int8x16_t __a)
++vcopy_lane_s8 (int8x8_t __a, const int __lane1,
++ int8x8_t __b, const int __lane2)
{
- return __builtin_aarch64_simd_bslv16qi_pupp (__a, __b, __c);
-+ return __builtin_aarch64_clrsbv16qi (__a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vbslq_p16 (uint16x8_t __a, poly16x8_t __b, poly16x8_t __c)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclsq_s16 (int16x8_t __a)
++vcopy_lane_s16 (int16x4_t __a, const int __lane1,
++ int16x4_t __b, const int __lane2)
{
- return __builtin_aarch64_simd_bslv8hi_pupp (__a, __b, __c);
-+ return __builtin_aarch64_clrsbv8hi (__a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vbslq_s8 (uint8x16_t __a, int8x16_t __b, int8x16_t __c)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclsq_s32 (int32x4_t __a)
++vcopy_lane_s32 (int32x2_t __a, const int __lane1,
++ int32x2_t __b, const int __lane2)
{
- return __builtin_aarch64_simd_bslv16qi_suss (__a, __b, __c);
-+ return __builtin_aarch64_clrsbv4si (__a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vbslq_s16 (uint16x8_t __a, int16x8_t __b, int16x8_t __c)
-+/* vclz. */
-+
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclz_s8 (int8x8_t __a)
++vcopy_lane_s64 (int64x1_t __a, const int __lane1,
++ int64x1_t __b, const int __lane2)
{
- return __builtin_aarch64_simd_bslv8hi_suss (__a, __b, __c);
-+ return __builtin_aarch64_clzv8qi (__a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vbslq_s32 (uint32x4_t __a, int32x4_t __b, int32x4_t __c)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclz_s16 (int16x4_t __a)
++vcopy_lane_u8 (uint8x8_t __a, const int __lane1,
++ uint8x8_t __b, const int __lane2)
{
- return __builtin_aarch64_simd_bslv4si_suss (__a, __b, __c);
-+ return __builtin_aarch64_clzv4hi (__a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vbslq_s64 (uint64x2_t __a, int64x2_t __b, int64x2_t __c)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclz_s32 (int32x2_t __a)
++vcopy_lane_u16 (uint16x4_t __a, const int __lane1,
++ uint16x4_t __b, const int __lane2)
{
- return __builtin_aarch64_simd_bslv2di_suss (__a, __b, __c);
-+ return __builtin_aarch64_clzv2si (__a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vbslq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclz_u8 (uint8x8_t __a)
++vcopy_lane_u32 (uint32x2_t __a, const int __lane1,
++ uint32x2_t __b, const int __lane2)
{
- return __builtin_aarch64_simd_bslv16qi_uuuu (__a, __b, __c);
-+ return (uint8x8_t)__builtin_aarch64_clzv8qi ((int8x8_t)__a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vbslq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclz_u16 (uint16x4_t __a)
++vcopy_lane_u64 (uint64x1_t __a, const int __lane1,
++ uint64x1_t __b, const int __lane2)
{
- return __builtin_aarch64_simd_bslv8hi_uuuu (__a, __b, __c);
-+ return (uint16x4_t)__builtin_aarch64_clzv4hi ((int16x4_t)__a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vbslq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
-+__extension__ extern __inline uint32x2_t
++/* vcopy_laneq. */
++
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclz_u32 (uint32x2_t __a)
++vcopy_laneq_f32 (float32x2_t __a, const int __lane1,
++ float32x4_t __b, const int __lane2)
{
- return __builtin_aarch64_simd_bslv4si_uuuu (__a, __b, __c);
-+ return (uint32x2_t)__builtin_aarch64_clzv2si ((int32x2_t)__a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vbslq_u64 (uint64x2_t __a, uint64x2_t __b, uint64x2_t __c)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclzq_s8 (int8x16_t __a)
++vcopy_laneq_f64 (float64x1_t __a, const int __lane1,
++ float64x2_t __b, const int __lane2)
{
- return __builtin_aarch64_simd_bslv2di_uuuu (__a, __b, __c);
-+ return __builtin_aarch64_clzv16qi (__a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-/* ARMv8.1 instrinsics. */
-#pragma GCC push_options
-#pragma GCC target ("arch=armv8.1-a")
--
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vcopy_laneq_p8 (poly8x8_t __a, const int __lane1,
++ poly8x16_t __b, const int __lane2)
++{
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
++}
+
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqrdmlah_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclzq_s16 (int16x8_t __a)
++vcopy_laneq_p16 (poly16x4_t __a, const int __lane1,
++ poly16x8_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlahv4hi (__a, __b, __c);
-+ return __builtin_aarch64_clzv8hi (__a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqrdmlah_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclzq_s32 (int32x4_t __a)
++vcopy_laneq_p64 (poly64x1_t __a, const int __lane1,
++ poly64x2_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlahv2si (__a, __b, __c);
-+ return __builtin_aarch64_clzv4si (__a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqrdmlahq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclzq_u8 (uint8x16_t __a)
++vcopy_laneq_s8 (int8x8_t __a, const int __lane1,
++ int8x16_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlahv8hi (__a, __b, __c);
-+ return (uint8x16_t)__builtin_aarch64_clzv16qi ((int8x16_t)__a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqrdmlahq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclzq_u16 (uint16x8_t __a)
++vcopy_laneq_s16 (int16x4_t __a, const int __lane1,
++ int16x8_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlahv4si (__a, __b, __c);
-+ return (uint16x8_t)__builtin_aarch64_clzv8hi ((int16x8_t)__a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqrdmlsh_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclzq_u32 (uint32x4_t __a)
++vcopy_laneq_s32 (int32x2_t __a, const int __lane1,
++ int32x4_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlshv4hi (__a, __b, __c);
-+ return (uint32x4_t)__builtin_aarch64_clzv4si ((int32x4_t)__a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqrdmlsh_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
-+/* vcnt. */
-+
-+__extension__ extern __inline poly8x8_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcnt_p8 (poly8x8_t __a)
++vcopy_laneq_s64 (int64x1_t __a, const int __lane1,
++ int64x2_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlshv2si (__a, __b, __c);
-+ return (poly8x8_t) __builtin_aarch64_popcountv8qi ((int8x8_t) __a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqrdmlshq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcnt_s8 (int8x8_t __a)
++vcopy_laneq_u8 (uint8x8_t __a, const int __lane1,
++ uint8x16_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlshv8hi (__a, __b, __c);
-+ return __builtin_aarch64_popcountv8qi (__a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqrdmlshq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcnt_u8 (uint8x8_t __a)
++vcopy_laneq_u16 (uint16x4_t __a, const int __lane1,
++ uint16x8_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlshv4si (__a, __b, __c);
-+ return (uint8x8_t) __builtin_aarch64_popcountv8qi ((int8x8_t) __a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqrdmlah_laneq_s16 (int16x4_t __a, int16x4_t __b, int16x8_t __c, const int __d)
-+__extension__ extern __inline poly8x16_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcntq_p8 (poly8x16_t __a)
++vcopy_laneq_u32 (uint32x2_t __a, const int __lane1,
++ uint32x4_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlah_laneqv4hi (__a, __b, __c, __d);
-+ return (poly8x16_t) __builtin_aarch64_popcountv16qi ((int8x16_t) __a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqrdmlah_laneq_s32 (int32x2_t __a, int32x2_t __b, int32x4_t __c, const int __d)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcntq_s8 (int8x16_t __a)
++vcopy_laneq_u64 (uint64x1_t __a, const int __lane1,
++ uint64x2_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlah_laneqv2si (__a, __b, __c, __d);
-+ return __builtin_aarch64_popcountv16qi (__a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqrdmlahq_laneq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c, const int __d)
-+__extension__ extern __inline uint8x16_t
++/* vcopyq_lane. */
++
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcntq_u8 (uint8x16_t __a)
++vcopyq_lane_f32 (float32x4_t __a, const int __lane1,
++ float32x2_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlah_laneqv8hi (__a, __b, __c, __d);
-+ return (uint8x16_t) __builtin_aarch64_popcountv16qi ((int8x16_t) __a);
++ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
++ __a, __lane1);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqrdmlahq_laneq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c, const int __d)
-+/* vcopy_lane. */
-+
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_f32 (float32x2_t __a, const int __lane1,
-+ float32x2_t __b, const int __lane2)
++vcopyq_lane_f64 (float64x2_t __a, const int __lane1,
++ float64x1_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlah_laneqv4si (__a, __b, __c, __d);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ __a, __lane1);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqrdmlsh_laneq_s16 (int16x4_t __a, int16x4_t __b, int16x8_t __c, const int __d)
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_f64 (float64x1_t __a, const int __lane1,
-+ float64x1_t __b, const int __lane2)
++vcopyq_lane_p8 (poly8x16_t __a, const int __lane1,
++ poly8x8_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlsh_laneqv4hi (__a, __b, __c, __d);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
@@ -23100,192 +24842,192 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqrdmlsh_laneq_s32 (int32x2_t __a, int32x2_t __b, int32x4_t __c, const int __d)
-+__extension__ extern __inline poly8x8_t
++__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_p8 (poly8x8_t __a, const int __lane1,
-+ poly8x8_t __b, const int __lane2)
++vcopyq_lane_p16 (poly16x8_t __a, const int __lane1,
++ poly16x4_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlsh_laneqv2si (__a, __b, __c, __d);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ __a, __lane1);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqrdmlshq_laneq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c, const int __d)
-+__extension__ extern __inline poly16x4_t
++__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_p16 (poly16x4_t __a, const int __lane1,
-+ poly16x4_t __b, const int __lane2)
++vcopyq_lane_p64 (poly64x2_t __a, const int __lane1,
++ poly64x1_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlsh_laneqv8hi (__a, __b, __c, __d);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ __a, __lane1);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqrdmlshq_laneq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c, const int __d)
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_s8 (int8x8_t __a, const int __lane1,
-+ int8x8_t __b, const int __lane2)
++vcopyq_lane_s8 (int8x16_t __a, const int __lane1,
++ int8x8_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlsh_laneqv4si (__a, __b, __c, __d);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ __a, __lane1);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqrdmlah_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_s16 (int16x4_t __a, const int __lane1,
-+ int16x4_t __b, const int __lane2)
++vcopyq_lane_s16 (int16x8_t __a, const int __lane1,
++ int16x4_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlah_lanev4hi (__a, __b, __c, __d);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ __a, __lane1);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqrdmlah_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_s32 (int32x2_t __a, const int __lane1,
-+ int32x2_t __b, const int __lane2)
++vcopyq_lane_s32 (int32x4_t __a, const int __lane1,
++ int32x2_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlah_lanev2si (__a, __b, __c, __d);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ __a, __lane1);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqrdmlahq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
-+__extension__ extern __inline int64x1_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_s64 (int64x1_t __a, const int __lane1,
-+ int64x1_t __b, const int __lane2)
++vcopyq_lane_s64 (int64x2_t __a, const int __lane1,
++ int64x1_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlah_lanev8hi (__a, __b, __c, __d);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ __a, __lane1);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqrdmlahq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_u8 (uint8x8_t __a, const int __lane1,
-+ uint8x8_t __b, const int __lane2)
++vcopyq_lane_u8 (uint8x16_t __a, const int __lane1,
++ uint8x8_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlah_lanev4si (__a, __b, __c, __d);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ __a, __lane1);
}
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vqrdmlahh_s16 (int16_t __a, int16_t __b, int16_t __c)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_u16 (uint16x4_t __a, const int __lane1,
-+ uint16x4_t __b, const int __lane2)
++vcopyq_lane_u16 (uint16x8_t __a, const int __lane1,
++ uint16x4_t __b, const int __lane2)
{
- return (int16_t) __builtin_aarch64_sqrdmlahhi (__a, __b, __c);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ __a, __lane1);
}
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vqrdmlahh_lane_s16 (int16_t __a, int16_t __b, int16x4_t __c, const int __d)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_u32 (uint32x2_t __a, const int __lane1,
-+ uint32x2_t __b, const int __lane2)
++vcopyq_lane_u32 (uint32x4_t __a, const int __lane1,
++ uint32x2_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlah_lanehi (__a, __b, __c, __d);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ __a, __lane1);
}
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vqrdmlahh_laneq_s16 (int16_t __a, int16_t __b, int16x8_t __c, const int __d)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_u64 (uint64x1_t __a, const int __lane1,
-+ uint64x1_t __b, const int __lane2)
++vcopyq_lane_u64 (uint64x2_t __a, const int __lane1,
++ uint64x1_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlah_laneqhi (__a, __b, __c, __d);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ __a, __lane1);
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vqrdmlahs_s32 (int32_t __a, int32_t __b, int32_t __c)
-+/* vcopy_laneq. */
++/* vcopyq_laneq. */
+
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_f32 (float32x2_t __a, const int __lane1,
-+ float32x4_t __b, const int __lane2)
++vcopyq_laneq_f32 (float32x4_t __a, const int __lane1,
++ float32x4_t __b, const int __lane2)
{
- return (int32_t) __builtin_aarch64_sqrdmlahsi (__a, __b, __c);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ __a, __lane1);
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vqrdmlahs_lane_s32 (int32_t __a, int32_t __b, int32x2_t __c, const int __d)
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_f64 (float64x1_t __a, const int __lane1,
-+ float64x2_t __b, const int __lane2)
++vcopyq_laneq_f64 (float64x2_t __a, const int __lane1,
++ float64x2_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlah_lanesi (__a, __b, __c, __d);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ __a, __lane1);
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vqrdmlahs_laneq_s32 (int32_t __a, int32_t __b, int32x4_t __c, const int __d)
-+__extension__ extern __inline poly8x8_t
++__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_p8 (poly8x8_t __a, const int __lane1,
-+ poly8x16_t __b, const int __lane2)
++vcopyq_laneq_p8 (poly8x16_t __a, const int __lane1,
++ poly8x16_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlah_laneqsi (__a, __b, __c, __d);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ __a, __lane1);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqrdmlsh_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
-+__extension__ extern __inline poly16x4_t
++__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_p16 (poly16x4_t __a, const int __lane1,
-+ poly16x8_t __b, const int __lane2)
++vcopyq_laneq_p16 (poly16x8_t __a, const int __lane1,
++ poly16x8_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlsh_lanev4hi (__a, __b, __c, __d);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ __a, __lane1);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqrdmlsh_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_s8 (int8x8_t __a, const int __lane1,
-+ int8x16_t __b, const int __lane2)
++vcopyq_laneq_p64 (poly64x2_t __a, const int __lane1,
++ poly64x2_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlsh_lanev2si (__a, __b, __c, __d);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ __a, __lane1);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqrdmlshq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_s16 (int16x4_t __a, const int __lane1,
-+ int16x8_t __b, const int __lane2)
++vcopyq_laneq_s8 (int8x16_t __a, const int __lane1,
++ int8x16_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlsh_lanev8hi (__a, __b, __c, __d);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
@@ -23294,46 +25036,46 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqrdmlshq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_s32 (int32x2_t __a, const int __lane1,
-+ int32x4_t __b, const int __lane2)
++vcopyq_laneq_s16 (int16x8_t __a, const int __lane1,
++ int16x8_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlsh_lanev4si (__a, __b, __c, __d);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ __a, __lane1);
}
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vqrdmlshh_s16 (int16_t __a, int16_t __b, int16_t __c)
-+__extension__ extern __inline int64x1_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_s64 (int64x1_t __a, const int __lane1,
-+ int64x2_t __b, const int __lane2)
++vcopyq_laneq_s32 (int32x4_t __a, const int __lane1,
++ int32x4_t __b, const int __lane2)
{
- return (int16_t) __builtin_aarch64_sqrdmlshhi (__a, __b, __c);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ __a, __lane1);
}
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vqrdmlshh_lane_s16 (int16_t __a, int16_t __b, int16x4_t __c, const int __d)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_u8 (uint8x8_t __a, const int __lane1,
-+ uint8x16_t __b, const int __lane2)
++vcopyq_laneq_s64 (int64x2_t __a, const int __lane1,
++ int64x2_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlsh_lanehi (__a, __b, __c, __d);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ __a, __lane1);
}
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vqrdmlshh_laneq_s16 (int16_t __a, int16_t __b, int16x8_t __c, const int __d)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_u16 (uint16x4_t __a, const int __lane1,
-+ uint16x8_t __b, const int __lane2)
++vcopyq_laneq_u8 (uint8x16_t __a, const int __lane1,
++ uint8x16_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlsh_laneqhi (__a, __b, __c, __d);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
@@ -23342,36 +25084,34 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vqrdmlshs_s32 (int32_t __a, int32_t __b, int32_t __c)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_u32 (uint32x2_t __a, const int __lane1,
-+ uint32x4_t __b, const int __lane2)
++vcopyq_laneq_u16 (uint16x8_t __a, const int __lane1,
++ uint16x8_t __b, const int __lane2)
{
- return (int32_t) __builtin_aarch64_sqrdmlshsi (__a, __b, __c);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ __a, __lane1);
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vqrdmlshs_lane_s32 (int32_t __a, int32_t __b, int32x2_t __c, const int __d)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_u64 (uint64x1_t __a, const int __lane1,
-+ uint64x2_t __b, const int __lane2)
++vcopyq_laneq_u32 (uint32x4_t __a, const int __lane1,
++ uint32x4_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlsh_lanesi (__a, __b, __c, __d);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ __a, __lane1);
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vqrdmlshs_laneq_s32 (int32_t __a, int32_t __b, int32x4_t __c, const int __d)
-+/* vcopyq_lane. */
-+
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_f32 (float32x4_t __a, const int __lane1,
-+ float32x2_t __b, const int __lane2)
++vcopyq_laneq_u64 (uint64x2_t __a, const int __lane1,
++ uint64x2_t __b, const int __lane2)
{
- return __builtin_aarch64_sqrdmlsh_laneqsi (__a, __b, __c, __d);
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
@@ -23382,2174 +25122,2116 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-#pragma GCC push_options
-#pragma GCC target ("+nothing+crypto")
-/* vaes */
--
++/* vcvt (double -> float). */
+
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vaeseq_u8 (uint8x16_t data, uint8x16_t key)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_f64 (float64x2_t __a, const int __lane1,
-+ float64x1_t __b, const int __lane2)
++vcvt_f16_f32 (float32x4_t __a)
{
- return __builtin_aarch64_crypto_aesev16qi_uuu (data, key);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ return __builtin_aarch64_float_truncate_lo_v4hf (__a);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vaesdq_u8 (uint8x16_t data, uint8x16_t key)
-+__extension__ extern __inline poly8x16_t
++__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_p8 (poly8x16_t __a, const int __lane1,
-+ poly8x8_t __b, const int __lane2)
++vcvt_high_f16_f32 (float16x4_t __a, float32x4_t __b)
{
- return __builtin_aarch64_crypto_aesdv16qi_uuu (data, key);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ return __builtin_aarch64_float_truncate_hi_v8hf (__a, __b);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vaesmcq_u8 (uint8x16_t data)
-+__extension__ extern __inline poly16x8_t
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_p16 (poly16x8_t __a, const int __lane1,
-+ poly16x4_t __b, const int __lane2)
++vcvt_f32_f64 (float64x2_t __a)
{
- return __builtin_aarch64_crypto_aesmcv16qi_uu (data);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ return __builtin_aarch64_float_truncate_lo_v2sf (__a);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vaesimcq_u8 (uint8x16_t data)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_s8 (int8x16_t __a, const int __lane1,
-+ int8x8_t __b, const int __lane2)
++vcvt_high_f32_f64 (float32x2_t __a, float64x2_t __b)
{
- return __builtin_aarch64_crypto_aesimcv16qi_uu (data);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ return __builtin_aarch64_float_truncate_hi_v4sf (__a, __b);
}
-#pragma GCC pop_options
-/* vcage */
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_s16 (int16x8_t __a, const int __lane1,
-+ int16x4_t __b, const int __lane2)
-+{
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
-+}
++/* vcvt (float -> double). */
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcage_f64 (float64x1_t __a, float64x1_t __b)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_s32 (int32x4_t __a, const int __lane1,
-+ int32x2_t __b, const int __lane2)
++vcvt_f32_f16 (float16x4_t __a)
{
- return vabs_f64 (__a) >= vabs_f64 (__b);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ return __builtin_aarch64_float_extend_lo_v4sf (__a);
}
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vcages_f32 (float32_t __a, float32_t __b)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_s64 (int64x2_t __a, const int __lane1,
-+ int64x1_t __b, const int __lane2)
++vcvt_f64_f32 (float32x2_t __a)
{
- return __builtin_fabsf (__a) >= __builtin_fabsf (__b) ? -1 : 0;
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++
++ return __builtin_aarch64_float_extend_lo_v2df (__a);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcage_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_u8 (uint8x16_t __a, const int __lane1,
-+ uint8x8_t __b, const int __lane2)
++vcvt_high_f32_f16 (float16x8_t __a)
{
- return vabs_f32 (__a) >= vabs_f32 (__b);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ return __builtin_aarch64_vec_unpacks_hi_v8hf (__a);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcageq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_u16 (uint16x8_t __a, const int __lane1,
-+ uint16x4_t __b, const int __lane2)
++vcvt_high_f64_f32 (float32x4_t __a)
{
- return vabsq_f32 (__a) >= vabsq_f32 (__b);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ return __builtin_aarch64_vec_unpacks_hi_v4sf (__a);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcaged_f64 (float64_t __a, float64_t __b)
-+__extension__ extern __inline uint32x4_t
++/* vcvt (<u>fixed-point -> float). */
++
++__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_u32 (uint32x4_t __a, const int __lane1,
-+ uint32x2_t __b, const int __lane2)
++vcvtd_n_f64_s64 (int64_t __a, const int __b)
{
- return __builtin_fabs (__a) >= __builtin_fabs (__b) ? -1 : 0;
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ return __builtin_aarch64_scvtfdi (__a, __b);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcageq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_u64 (uint64x2_t __a, const int __lane1,
-+ uint64x1_t __b, const int __lane2)
++vcvtd_n_f64_u64 (uint64_t __a, const int __b)
{
- return vabsq_f64 (__a) >= vabsq_f64 (__b);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ return __builtin_aarch64_ucvtfdi_sus (__a, __b);
}
-/* vcagt */
-+/* vcopyq_laneq. */
-
+-
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vcagts_f32 (float32_t __a, float32_t __b)
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_f32 (float32x4_t __a, const int __lane1,
-+ float32x4_t __b, const int __lane2)
++vcvts_n_f32_s32 (int32_t __a, const int __b)
{
- return __builtin_fabsf (__a) > __builtin_fabsf (__b) ? -1 : 0;
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ return __builtin_aarch64_scvtfsi (__a, __b);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcagt_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_f64 (float64x2_t __a, const int __lane1,
-+ float64x2_t __b, const int __lane2)
++vcvts_n_f32_u32 (uint32_t __a, const int __b)
{
- return vabs_f32 (__a) > vabs_f32 (__b);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ return __builtin_aarch64_ucvtfsi_sus (__a, __b);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcagt_f64 (float64x1_t __a, float64x1_t __b)
-+__extension__ extern __inline poly8x16_t
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_p8 (poly8x16_t __a, const int __lane1,
-+ poly8x16_t __b, const int __lane2)
++vcvt_n_f32_s32 (int32x2_t __a, const int __b)
{
- return vabs_f64 (__a) > vabs_f64 (__b);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ return __builtin_aarch64_scvtfv2si (__a, __b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcagtq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline poly16x8_t
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_p16 (poly16x8_t __a, const int __lane1,
-+ poly16x8_t __b, const int __lane2)
++vcvt_n_f32_u32 (uint32x2_t __a, const int __b)
{
- return vabsq_f32 (__a) > vabsq_f32 (__b);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ return __builtin_aarch64_ucvtfv2si_sus (__a, __b);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcagtd_f64 (float64_t __a, float64_t __b)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_s8 (int8x16_t __a, const int __lane1,
-+ int8x16_t __b, const int __lane2)
++vcvt_n_f64_s64 (int64x1_t __a, const int __b)
{
- return __builtin_fabs (__a) > __builtin_fabs (__b) ? -1 : 0;
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ return (float64x1_t)
++ { __builtin_aarch64_scvtfdi (vget_lane_s64 (__a, 0), __b) };
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcagtq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_s16 (int16x8_t __a, const int __lane1,
-+ int16x8_t __b, const int __lane2)
++vcvt_n_f64_u64 (uint64x1_t __a, const int __b)
{
- return vabsq_f64 (__a) > vabsq_f64 (__b);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ return (float64x1_t)
++ { __builtin_aarch64_ucvtfdi_sus (vget_lane_u64 (__a, 0), __b) };
}
-/* vcale */
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_s32 (int32x4_t __a, const int __lane1,
-+ int32x4_t __b, const int __lane2)
-+{
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
-+}
-
+-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcale_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_s64 (int64x2_t __a, const int __lane1,
-+ int64x2_t __b, const int __lane2)
++vcvtq_n_f32_s32 (int32x4_t __a, const int __b)
{
- return vabs_f32 (__a) <= vabs_f32 (__b);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ return __builtin_aarch64_scvtfv4si (__a, __b);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcale_f64 (float64x1_t __a, float64x1_t __b)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_u8 (uint8x16_t __a, const int __lane1,
-+ uint8x16_t __b, const int __lane2)
++vcvtq_n_f32_u32 (uint32x4_t __a, const int __b)
{
- return vabs_f64 (__a) <= vabs_f64 (__b);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ return __builtin_aarch64_ucvtfv4si_sus (__a, __b);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcaled_f64 (float64_t __a, float64_t __b)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_u16 (uint16x8_t __a, const int __lane1,
-+ uint16x8_t __b, const int __lane2)
++vcvtq_n_f64_s64 (int64x2_t __a, const int __b)
{
- return __builtin_fabs (__a) <= __builtin_fabs (__b) ? -1 : 0;
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ return __builtin_aarch64_scvtfv2di (__a, __b);
}
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vcales_f32 (float32_t __a, float32_t __b)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_u32 (uint32x4_t __a, const int __lane1,
-+ uint32x4_t __b, const int __lane2)
++vcvtq_n_f64_u64 (uint64x2_t __a, const int __b)
{
- return __builtin_fabsf (__a) <= __builtin_fabsf (__b) ? -1 : 0;
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ return __builtin_aarch64_ucvtfv2di_sus (__a, __b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcaleq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline uint64x2_t
++/* vcvt (float -> <u>fixed-point). */
++
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_u64 (uint64x2_t __a, const int __lane1,
-+ uint64x2_t __b, const int __lane2)
++vcvtd_n_s64_f64 (float64_t __a, const int __b)
{
- return vabsq_f32 (__a) <= vabsq_f32 (__b);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
++ return __builtin_aarch64_fcvtzsdf (__a, __b);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcaleq_f64 (float64x2_t __a, float64x2_t __b)
-+/* vcvt (double -> float). */
-+
-+__extension__ extern __inline float16x4_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_f16_f32 (float32x4_t __a)
++vcvtd_n_u64_f64 (float64_t __a, const int __b)
{
- return vabsq_f64 (__a) <= vabsq_f64 (__b);
-+ return __builtin_aarch64_float_truncate_lo_v4hf (__a);
++ return __builtin_aarch64_fcvtzudf_uss (__a, __b);
}
-/* vcalt */
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_high_f16_f32 (float16x4_t __a, float32x4_t __b)
-+{
-+ return __builtin_aarch64_float_truncate_hi_v8hf (__a, __b);
-+}
-
+-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcalt_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_f32_f64 (float64x2_t __a)
++vcvts_n_s32_f32 (float32_t __a, const int __b)
{
- return vabs_f32 (__a) < vabs_f32 (__b);
-+ return __builtin_aarch64_float_truncate_lo_v2sf (__a);
++ return __builtin_aarch64_fcvtzssf (__a, __b);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcalt_f64 (float64x1_t __a, float64x1_t __b)
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_high_f32_f64 (float32x2_t __a, float64x2_t __b)
++vcvts_n_u32_f32 (float32_t __a, const int __b)
{
- return vabs_f64 (__a) < vabs_f64 (__b);
-+ return __builtin_aarch64_float_truncate_hi_v4sf (__a, __b);
++ return __builtin_aarch64_fcvtzusf_uss (__a, __b);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcaltd_f64 (float64_t __a, float64_t __b)
-+/* vcvt (float -> double). */
-+
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_f32_f16 (float16x4_t __a)
++vcvt_n_s32_f32 (float32x2_t __a, const int __b)
{
- return __builtin_fabs (__a) < __builtin_fabs (__b) ? -1 : 0;
-+ return __builtin_aarch64_float_extend_lo_v4sf (__a);
++ return __builtin_aarch64_fcvtzsv2sf (__a, __b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcaltq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_f64_f32 (float32x2_t __a)
++vcvt_n_u32_f32 (float32x2_t __a, const int __b)
{
- return vabsq_f32 (__a) < vabsq_f32 (__b);
-+
-+ return __builtin_aarch64_float_extend_lo_v2df (__a);
++ return __builtin_aarch64_fcvtzuv2sf_uss (__a, __b);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcaltq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_high_f32_f16 (float16x8_t __a)
++vcvt_n_s64_f64 (float64x1_t __a, const int __b)
{
- return vabsq_f64 (__a) < vabsq_f64 (__b);
-+ return __builtin_aarch64_vec_unpacks_hi_v8hf (__a);
++ return (int64x1_t)
++ { __builtin_aarch64_fcvtzsdf (vget_lane_f64 (__a, 0), __b) };
}
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vcalts_f32 (float32_t __a, float32_t __b)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_high_f64_f32 (float32x4_t __a)
++vcvt_n_u64_f64 (float64x1_t __a, const int __b)
{
- return __builtin_fabsf (__a) < __builtin_fabsf (__b) ? -1 : 0;
-+ return __builtin_aarch64_vec_unpacks_hi_v4sf (__a);
++ return (uint64x1_t)
++ { __builtin_aarch64_fcvtzudf_uss (vget_lane_f64 (__a, 0), __b) };
}
-/* vceq - vector. */
-+/* vcvt (<u>fixed-point -> float). */
-
+-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vceq_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline float64_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtd_n_f64_s64 (int64_t __a, const int __b)
++vcvtq_n_s32_f32 (float32x4_t __a, const int __b)
{
- return (uint32x2_t) (__a == __b);
-+ return __builtin_aarch64_scvtfdi (__a, __b);
++ return __builtin_aarch64_fcvtzsv4sf (__a, __b);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceq_f64 (float64x1_t __a, float64x1_t __b)
-+__extension__ extern __inline float64_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtd_n_f64_u64 (uint64_t __a, const int __b)
++vcvtq_n_u32_f32 (float32x4_t __a, const int __b)
{
- return (uint64x1_t) (__a == __b);
-+ return __builtin_aarch64_ucvtfdi_sus (__a, __b);
++ return __builtin_aarch64_fcvtzuv4sf_uss (__a, __b);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vceq_p8 (poly8x8_t __a, poly8x8_t __b)
-+__extension__ extern __inline float32_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvts_n_f32_s32 (int32_t __a, const int __b)
++vcvtq_n_s64_f64 (float64x2_t __a, const int __b)
{
- return (uint8x8_t) (__a == __b);
-+ return __builtin_aarch64_scvtfsi (__a, __b);
++ return __builtin_aarch64_fcvtzsv2df (__a, __b);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vceq_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline float32_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvts_n_f32_u32 (uint32_t __a, const int __b)
++vcvtq_n_u64_f64 (float64x2_t __a, const int __b)
{
- return (uint8x8_t) (__a == __b);
-+ return __builtin_aarch64_ucvtfsi_sus (__a, __b);
++ return __builtin_aarch64_fcvtzuv2df_uss (__a, __b);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vceq_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline float32x2_t
++/* vcvt (<u>int -> float) */
++
++__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_f32_s32 (int32x2_t __a, const int __b)
++vcvtd_f64_s64 (int64_t __a)
{
- return (uint16x4_t) (__a == __b);
-+ return __builtin_aarch64_scvtfv2si (__a, __b);
++ return (float64_t) __a;
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vceq_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_f32_u32 (uint32x2_t __a, const int __b)
++vcvtd_f64_u64 (uint64_t __a)
{
- return (uint32x2_t) (__a == __b);
-+ return __builtin_aarch64_ucvtfv2si_sus (__a, __b);
++ return (float64_t) __a;
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceq_s64 (int64x1_t __a, int64x1_t __b)
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_f64_s64 (int64x1_t __a, const int __b)
++vcvts_f32_s32 (int32_t __a)
{
- return (uint64x1_t) (__a == __b);
-+ return (float64x1_t)
-+ { __builtin_aarch64_scvtfdi (vget_lane_s64 (__a, 0), __b) };
++ return (float32_t) __a;
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vceq_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_f64_u64 (uint64x1_t __a, const int __b)
++vcvts_f32_u32 (uint32_t __a)
{
- return (__a == __b);
-+ return (float64x1_t)
-+ { __builtin_aarch64_ucvtfdi_sus (vget_lane_u64 (__a, 0), __b) };
++ return (float32_t) __a;
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vceq_u16 (uint16x4_t __a, uint16x4_t __b)
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_f32_s32 (int32x4_t __a, const int __b)
++vcvt_f32_s32 (int32x2_t __a)
{
- return (__a == __b);
-+ return __builtin_aarch64_scvtfv4si (__a, __b);
++ return __builtin_aarch64_floatv2siv2sf (__a);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vceq_u32 (uint32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_f32_u32 (uint32x4_t __a, const int __b)
++vcvt_f32_u32 (uint32x2_t __a)
{
- return (__a == __b);
-+ return __builtin_aarch64_ucvtfv4si_sus (__a, __b);
++ return __builtin_aarch64_floatunsv2siv2sf ((int32x2_t) __a);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceq_u64 (uint64x1_t __a, uint64x1_t __b)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_f64_s64 (int64x2_t __a, const int __b)
++vcvt_f64_s64 (int64x1_t __a)
{
- return (__a == __b);
-+ return __builtin_aarch64_scvtfv2di (__a, __b);
++ return (float64x1_t) { vget_lane_s64 (__a, 0) };
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vceqq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_f64_u64 (uint64x2_t __a, const int __b)
++vcvt_f64_u64 (uint64x1_t __a)
{
- return (uint32x4_t) (__a == __b);
-+ return __builtin_aarch64_ucvtfv2di_sus (__a, __b);
++ return (float64x1_t) { vget_lane_u64 (__a, 0) };
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vceqq_f64 (float64x2_t __a, float64x2_t __b)
-+/* vcvt (float -> <u>fixed-point). */
-+
-+__extension__ extern __inline int64_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtd_n_s64_f64 (float64_t __a, const int __b)
++vcvtq_f32_s32 (int32x4_t __a)
{
- return (uint64x2_t) (__a == __b);
-+ return __builtin_aarch64_fcvtzsdf (__a, __b);
++ return __builtin_aarch64_floatv4siv4sf (__a);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vceqq_p8 (poly8x16_t __a, poly8x16_t __b)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtd_n_u64_f64 (float64_t __a, const int __b)
++vcvtq_f32_u32 (uint32x4_t __a)
{
- return (uint8x16_t) (__a == __b);
-+ return __builtin_aarch64_fcvtzudf_uss (__a, __b);
++ return __builtin_aarch64_floatunsv4siv4sf ((int32x4_t) __a);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vceqq_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvts_n_s32_f32 (float32_t __a, const int __b)
++vcvtq_f64_s64 (int64x2_t __a)
{
- return (uint8x16_t) (__a == __b);
-+ return __builtin_aarch64_fcvtzssf (__a, __b);
++ return __builtin_aarch64_floatv2div2df (__a);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vceqq_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvts_n_u32_f32 (float32_t __a, const int __b)
++vcvtq_f64_u64 (uint64x2_t __a)
{
- return (uint16x8_t) (__a == __b);
-+ return __builtin_aarch64_fcvtzusf_uss (__a, __b);
++ return __builtin_aarch64_floatunsv2div2df ((int64x2_t) __a);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vceqq_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline int32x2_t
++/* vcvt (float -> <u>int) */
++
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_s32_f32 (float32x2_t __a, const int __b)
++vcvtd_s64_f64 (float64_t __a)
{
- return (uint32x4_t) (__a == __b);
-+ return __builtin_aarch64_fcvtzsv2sf (__a, __b);
++ return (int64_t) __a;
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vceqq_s64 (int64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_u32_f32 (float32x2_t __a, const int __b)
++vcvtd_u64_f64 (float64_t __a)
{
- return (uint64x2_t) (__a == __b);
-+ return __builtin_aarch64_fcvtzuv2sf_uss (__a, __b);
++ return (uint64_t) __a;
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vceqq_u8 (uint8x16_t __a, uint8x16_t __b)
-+__extension__ extern __inline int64x1_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_s64_f64 (float64x1_t __a, const int __b)
++vcvts_s32_f32 (float32_t __a)
{
- return (__a == __b);
-+ return (int64x1_t)
-+ { __builtin_aarch64_fcvtzsdf (vget_lane_f64 (__a, 0), __b) };
++ return (int32_t) __a;
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vceqq_u16 (uint16x8_t __a, uint16x8_t __b)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_u64_f64 (float64x1_t __a, const int __b)
++vcvts_u32_f32 (float32_t __a)
{
- return (__a == __b);
-+ return (uint64x1_t)
-+ { __builtin_aarch64_fcvtzudf_uss (vget_lane_f64 (__a, 0), __b) };
++ return (uint32_t) __a;
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vceqq_u32 (uint32x4_t __a, uint32x4_t __b)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_s32_f32 (float32x4_t __a, const int __b)
++vcvt_s32_f32 (float32x2_t __a)
{
- return (__a == __b);
-+ return __builtin_aarch64_fcvtzsv4sf (__a, __b);
++ return __builtin_aarch64_lbtruncv2sfv2si (__a);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vceqq_u64 (uint64x2_t __a, uint64x2_t __b)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_u32_f32 (float32x4_t __a, const int __b)
++vcvt_u32_f32 (float32x2_t __a)
{
- return (__a == __b);
-+ return __builtin_aarch64_fcvtzuv4sf_uss (__a, __b);
++ return __builtin_aarch64_lbtruncuv2sfv2si_us (__a);
}
-/* vceq - scalar. */
-
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vceqs_f32 (float32_t __a, float32_t __b)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_s64_f64 (float64x2_t __a, const int __b)
++vcvtq_s32_f32 (float32x4_t __a)
{
- return __a == __b ? -1 : 0;
-+ return __builtin_aarch64_fcvtzsv2df (__a, __b);
++ return __builtin_aarch64_lbtruncv4sfv4si (__a);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vceqd_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_u64_f64 (float64x2_t __a, const int __b)
++vcvtq_u32_f32 (float32x4_t __a)
{
- return __a == __b ? -1ll : 0ll;
-+ return __builtin_aarch64_fcvtzuv2df_uss (__a, __b);
++ return __builtin_aarch64_lbtruncuv4sfv4si_us (__a);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vceqd_u64 (uint64_t __a, uint64_t __b)
-+/* vcvt (<u>int -> float) */
-+
-+__extension__ extern __inline float64_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtd_f64_s64 (int64_t __a)
++vcvt_s64_f64 (float64x1_t __a)
{
- return __a == __b ? -1ll : 0ll;
-+ return (float64_t) __a;
++ return (int64x1_t) {vcvtd_s64_f64 (__a[0])};
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vceqd_f64 (float64_t __a, float64_t __b)
-+__extension__ extern __inline float64_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtd_f64_u64 (uint64_t __a)
++vcvt_u64_f64 (float64x1_t __a)
{
- return __a == __b ? -1ll : 0ll;
-+ return (float64_t) __a;
++ return (uint64x1_t) {vcvtd_u64_f64 (__a[0])};
}
-/* vceqz - vector. */
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vceqz_f32 (float32x2_t __a)
-+__extension__ extern __inline float32_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvts_f32_s32 (int32_t __a)
++vcvtq_s64_f64 (float64x2_t __a)
{
- return (uint32x2_t) (__a == 0.0f);
-+ return (float32_t) __a;
++ return __builtin_aarch64_lbtruncv2dfv2di (__a);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceqz_f64 (float64x1_t __a)
-+__extension__ extern __inline float32_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvts_f32_u32 (uint32_t __a)
++vcvtq_u64_f64 (float64x2_t __a)
{
- return (uint64x1_t) (__a == (float64x1_t) {0.0});
-+ return (float32_t) __a;
++ return __builtin_aarch64_lbtruncuv2dfv2di_us (__a);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vceqz_p8 (poly8x8_t __a)
-+__extension__ extern __inline float32x2_t
++/* vcvta */
++
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_f32_s32 (int32x2_t __a)
++vcvtad_s64_f64 (float64_t __a)
{
- return (uint8x8_t) (__a == 0);
-+ return __builtin_aarch64_floatv2siv2sf (__a);
++ return __builtin_aarch64_lrounddfdi (__a);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vceqz_s8 (int8x8_t __a)
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_f32_u32 (uint32x2_t __a)
++vcvtad_u64_f64 (float64_t __a)
{
- return (uint8x8_t) (__a == 0);
-+ return __builtin_aarch64_floatunsv2siv2sf ((int32x2_t) __a);
++ return __builtin_aarch64_lroundudfdi_us (__a);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vceqz_s16 (int16x4_t __a)
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_f64_s64 (int64x1_t __a)
++vcvtas_s32_f32 (float32_t __a)
{
- return (uint16x4_t) (__a == 0);
-+ return (float64x1_t) { vget_lane_s64 (__a, 0) };
++ return __builtin_aarch64_lroundsfsi (__a);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vceqz_s32 (int32x2_t __a)
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_f64_u64 (uint64x1_t __a)
++vcvtas_u32_f32 (float32_t __a)
{
- return (uint32x2_t) (__a == 0);
-+ return (float64x1_t) { vget_lane_u64 (__a, 0) };
++ return __builtin_aarch64_lroundusfsi_us (__a);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceqz_s64 (int64x1_t __a)
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_f32_s32 (int32x4_t __a)
++vcvta_s32_f32 (float32x2_t __a)
{
- return (uint64x1_t) (__a == __AARCH64_INT64_C (0));
-+ return __builtin_aarch64_floatv4siv4sf (__a);
++ return __builtin_aarch64_lroundv2sfv2si (__a);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vceqz_u8 (uint8x8_t __a)
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_f32_u32 (uint32x4_t __a)
++vcvta_u32_f32 (float32x2_t __a)
{
- return (__a == 0);
-+ return __builtin_aarch64_floatunsv4siv4sf ((int32x4_t) __a);
++ return __builtin_aarch64_lrounduv2sfv2si_us (__a);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vceqz_u16 (uint16x4_t __a)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_f64_s64 (int64x2_t __a)
++vcvtaq_s32_f32 (float32x4_t __a)
{
- return (__a == 0);
-+ return __builtin_aarch64_floatv2div2df (__a);
++ return __builtin_aarch64_lroundv4sfv4si (__a);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vceqz_u32 (uint32x2_t __a)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_f64_u64 (uint64x2_t __a)
++vcvtaq_u32_f32 (float32x4_t __a)
{
- return (__a == 0);
-+ return __builtin_aarch64_floatunsv2div2df ((int64x2_t) __a);
++ return __builtin_aarch64_lrounduv4sfv4si_us (__a);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceqz_u64 (uint64x1_t __a)
-+/* vcvt (float -> <u>int) */
-+
-+__extension__ extern __inline int64_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtd_s64_f64 (float64_t __a)
++vcvta_s64_f64 (float64x1_t __a)
{
- return (__a == __AARCH64_UINT64_C (0));
-+ return (int64_t) __a;
++ return (int64x1_t) {vcvtad_s64_f64 (__a[0])};
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vceqzq_f32 (float32x4_t __a)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtd_u64_f64 (float64_t __a)
++vcvta_u64_f64 (float64x1_t __a)
{
- return (uint32x4_t) (__a == 0.0f);
-+ return (uint64_t) __a;
++ return (uint64x1_t) {vcvtad_u64_f64 (__a[0])};
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vceqzq_f64 (float64x2_t __a)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvts_s32_f32 (float32_t __a)
++vcvtaq_s64_f64 (float64x2_t __a)
{
- return (uint64x2_t) (__a == 0.0f);
-+ return (int32_t) __a;
++ return __builtin_aarch64_lroundv2dfv2di (__a);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vceqzq_p8 (poly8x16_t __a)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvts_u32_f32 (float32_t __a)
++vcvtaq_u64_f64 (float64x2_t __a)
{
- return (uint8x16_t) (__a == 0);
-+ return (uint32_t) __a;
++ return __builtin_aarch64_lrounduv2dfv2di_us (__a);
}
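
The vcvta* family (the lround builtins) rounds to nearest with ties away
from zero. A small sketch of the tie-breaking, again outside the patch:

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  /* Ties round away from zero: 0.5f -> 1 and -0.5f -> -1.  */
  float32x2_t f = {0.5f, -0.5f};
  int32x2_t i = vcvta_s32_f32 (f);
  assert (vget_lane_s32 (i, 0) == 1);
  assert (vget_lane_s32 (i, 1) == -1);
  return 0;
}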
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vceqzq_s8 (int8x16_t __a)
-+__extension__ extern __inline int32x2_t
++/* vcvtm */
++
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_s32_f32 (float32x2_t __a)
++vcvtmd_s64_f64 (float64_t __a)
{
- return (uint8x16_t) (__a == 0);
-+ return __builtin_aarch64_lbtruncv2sfv2si (__a);
++ return __builtin_llfloor (__a);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vceqzq_s16 (int16x8_t __a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_u32_f32 (float32x2_t __a)
++vcvtmd_u64_f64 (float64_t __a)
{
- return (uint16x8_t) (__a == 0);
-+ return __builtin_aarch64_lbtruncuv2sfv2si_us (__a);
++ return __builtin_aarch64_lfloorudfdi_us (__a);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vceqzq_s32 (int32x4_t __a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_s32_f32 (float32x4_t __a)
++vcvtms_s32_f32 (float32_t __a)
{
- return (uint32x4_t) (__a == 0);
-+ return __builtin_aarch64_lbtruncv4sfv4si (__a);
++ return __builtin_ifloorf (__a);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vceqzq_s64 (int64x2_t __a)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_u32_f32 (float32x4_t __a)
++vcvtms_u32_f32 (float32_t __a)
{
- return (uint64x2_t) (__a == __AARCH64_INT64_C (0));
-+ return __builtin_aarch64_lbtruncuv4sfv4si_us (__a);
++ return __builtin_aarch64_lfloorusfsi_us (__a);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vceqzq_u8 (uint8x16_t __a)
-+__extension__ extern __inline int64x1_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_s64_f64 (float64x1_t __a)
++vcvtm_s32_f32 (float32x2_t __a)
{
- return (__a == 0);
-+ return (int64x1_t) {vcvtd_s64_f64 (__a[0])};
++ return __builtin_aarch64_lfloorv2sfv2si (__a);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vceqzq_u16 (uint16x8_t __a)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_u64_f64 (float64x1_t __a)
++vcvtm_u32_f32 (float32x2_t __a)
{
- return (__a == 0);
-+ return (uint64x1_t) {vcvtd_u64_f64 (__a[0])};
++ return __builtin_aarch64_lflooruv2sfv2si_us (__a);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vceqzq_u32 (uint32x4_t __a)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_s64_f64 (float64x2_t __a)
++vcvtmq_s32_f32 (float32x4_t __a)
{
- return (__a == 0);
-+ return __builtin_aarch64_lbtruncv2dfv2di (__a);
++ return __builtin_aarch64_lfloorv4sfv4si (__a);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vceqzq_u64 (uint64x2_t __a)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_u64_f64 (float64x2_t __a)
++vcvtmq_u32_f32 (float32x4_t __a)
{
- return (__a == __AARCH64_UINT64_C (0));
-+ return __builtin_aarch64_lbtruncuv2dfv2di_us (__a);
++ return __builtin_aarch64_lflooruv4sfv4si_us (__a);
}
-/* vceqz - scalar. */
-+/* vcvta */
-
+-
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vceqzs_f32 (float32_t __a)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtad_s64_f64 (float64_t __a)
++vcvtm_s64_f64 (float64x1_t __a)
{
- return __a == 0.0f ? -1 : 0;
-+ return __builtin_aarch64_lrounddfdi (__a);
++ return (int64x1_t) {vcvtmd_s64_f64 (__a[0])};
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vceqzd_s64 (int64_t __a)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtad_u64_f64 (float64_t __a)
++vcvtm_u64_f64 (float64x1_t __a)
{
- return __a == 0 ? -1ll : 0ll;
-+ return __builtin_aarch64_lroundudfdi_us (__a);
++ return (uint64x1_t) {vcvtmd_u64_f64 (__a[0])};
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vceqzd_u64 (uint64_t __a)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtas_s32_f32 (float32_t __a)
++vcvtmq_s64_f64 (float64x2_t __a)
{
- return __a == 0 ? -1ll : 0ll;
-+ return __builtin_aarch64_lroundsfsi (__a);
++ return __builtin_aarch64_lfloorv2dfv2di (__a);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vceqzd_f64 (float64_t __a)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtas_u32_f32 (float32_t __a)
++vcvtmq_u64_f64 (float64x2_t __a)
{
- return __a == 0.0 ? -1ll : 0ll;
-+ return __builtin_aarch64_lroundusfsi_us (__a);
++ return __builtin_aarch64_lflooruv2dfv2di_us (__a);
}
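
vcvtm* rounds toward minus infinity (the lfloor builtins), which is why the
scalar double case can simply use __builtin_llfloor. A minimal sketch, not
part of the patch:

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  /* Rounding toward minus infinity matches floor(): -1.1 -> -2, 1.9 -> 1.  */
  assert (vcvtmd_s64_f64 (-1.1) == -2);
  assert (vcvtmd_s64_f64 (1.9) == 1);
  return 0;
}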
-/* vcge - vector. */
--
++/* vcvtn */
+
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcge_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvta_s32_f32 (float32x2_t __a)
++vcvtnd_s64_f64 (float64_t __a)
{
- return (uint32x2_t) (__a >= __b);
-+ return __builtin_aarch64_lroundv2sfv2si (__a);
++ return __builtin_aarch64_lfrintndfdi (__a);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcge_f64 (float64x1_t __a, float64x1_t __b)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvta_u32_f32 (float32x2_t __a)
++vcvtnd_u64_f64 (float64_t __a)
{
- return (uint64x1_t) (__a >= __b);
-+ return __builtin_aarch64_lrounduv2sfv2si_us (__a);
++ return __builtin_aarch64_lfrintnudfdi_us (__a);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcge_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtaq_s32_f32 (float32x4_t __a)
++vcvtns_s32_f32 (float32_t __a)
{
- return (uint8x8_t) (__a >= __b);
-+ return __builtin_aarch64_lroundv4sfv4si (__a);
++ return __builtin_aarch64_lfrintnsfsi (__a);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcge_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtaq_u32_f32 (float32x4_t __a)
++vcvtns_u32_f32 (float32_t __a)
{
- return (uint16x4_t) (__a >= __b);
-+ return __builtin_aarch64_lrounduv4sfv4si_us (__a);
++ return __builtin_aarch64_lfrintnusfsi_us (__a);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcge_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline int64x1_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvta_s64_f64 (float64x1_t __a)
++vcvtn_s32_f32 (float32x2_t __a)
{
- return (uint32x2_t) (__a >= __b);
-+ return (int64x1_t) {vcvtad_s64_f64 (__a[0])};
++ return __builtin_aarch64_lfrintnv2sfv2si (__a);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcge_s64 (int64x1_t __a, int64x1_t __b)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvta_u64_f64 (float64x1_t __a)
++vcvtn_u32_f32 (float32x2_t __a)
{
- return (uint64x1_t) (__a >= __b);
-+ return (uint64x1_t) {vcvtad_u64_f64 (__a[0])};
++ return __builtin_aarch64_lfrintnuv2sfv2si_us (__a);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcge_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtaq_s64_f64 (float64x2_t __a)
++vcvtnq_s32_f32 (float32x4_t __a)
{
- return (__a >= __b);
-+ return __builtin_aarch64_lroundv2dfv2di (__a);
++ return __builtin_aarch64_lfrintnv4sfv4si (__a);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcge_u16 (uint16x4_t __a, uint16x4_t __b)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtaq_u64_f64 (float64x2_t __a)
++vcvtnq_u32_f32 (float32x4_t __a)
{
- return (__a >= __b);
-+ return __builtin_aarch64_lrounduv2dfv2di_us (__a);
++ return __builtin_aarch64_lfrintnuv4sfv4si_us (__a);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcge_u32 (uint32x2_t __a, uint32x2_t __b)
-+/* vcvtm */
-+
-+__extension__ extern __inline int64_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtmd_s64_f64 (float64_t __a)
++vcvtn_s64_f64 (float64x1_t __a)
{
- return (__a >= __b);
-+ return __builtin_llfloor (__a);
++ return (int64x1_t) {vcvtnd_s64_f64 (__a[0])};
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcge_u64 (uint64x1_t __a, uint64x1_t __b)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtmd_u64_f64 (float64_t __a)
++vcvtn_u64_f64 (float64x1_t __a)
{
- return (__a >= __b);
-+ return __builtin_aarch64_lfloorudfdi_us (__a);
++ return (uint64x1_t) {vcvtnd_u64_f64 (__a[0])};
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgeq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtms_s32_f32 (float32_t __a)
++vcvtnq_s64_f64 (float64x2_t __a)
{
- return (uint32x4_t) (__a >= __b);
-+ return __builtin_ifloorf (__a);
++ return __builtin_aarch64_lfrintnv2dfv2di (__a);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcgeq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtms_u32_f32 (float32_t __a)
++vcvtnq_u64_f64 (float64x2_t __a)
{
- return (uint64x2_t) (__a >= __b);
-+ return __builtin_aarch64_lfloorusfsi_us (__a);
++ return __builtin_aarch64_lfrintnuv2dfv2di_us (__a);
}
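
vcvtn* uses round-to-nearest with ties to even (the lfrintn builtins), the
IEEE default, so it differs from vcvta* only on exact .5 ties. For example
(illustration only, not patch content):

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  /* Ties go to the even integer: 0.5 -> 0, 1.5 -> 2, 2.5 -> 2.  */
  assert (vcvtnd_s64_f64 (0.5) == 0);
  assert (vcvtnd_s64_f64 (1.5) == 2);
  assert (vcvtnd_s64_f64 (2.5) == 2);
  return 0;
}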
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcgeq_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline int32x2_t
++/* vcvtp */
++
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtm_s32_f32 (float32x2_t __a)
++vcvtpd_s64_f64 (float64_t __a)
{
- return (uint8x16_t) (__a >= __b);
-+ return __builtin_aarch64_lfloorv2sfv2si (__a);
++ return __builtin_llceil (__a);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcgeq_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtm_u32_f32 (float32x2_t __a)
++vcvtpd_u64_f64 (float64_t __a)
{
- return (uint16x8_t) (__a >= __b);
-+ return __builtin_aarch64_lflooruv2sfv2si_us (__a);
++ return __builtin_aarch64_lceiludfdi_us (__a);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgeq_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtmq_s32_f32 (float32x4_t __a)
++vcvtps_s32_f32 (float32_t __a)
{
- return (uint32x4_t) (__a >= __b);
-+ return __builtin_aarch64_lfloorv4sfv4si (__a);
++ return __builtin_iceilf (__a);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcgeq_s64 (int64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtmq_u32_f32 (float32x4_t __a)
++vcvtps_u32_f32 (float32_t __a)
{
- return (uint64x2_t) (__a >= __b);
-+ return __builtin_aarch64_lflooruv4sfv4si_us (__a);
++ return __builtin_aarch64_lceilusfsi_us (__a);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcgeq_u8 (uint8x16_t __a, uint8x16_t __b)
-+__extension__ extern __inline int64x1_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtm_s64_f64 (float64x1_t __a)
++vcvtp_s32_f32 (float32x2_t __a)
{
- return (__a >= __b);
-+ return (int64x1_t) {vcvtmd_s64_f64 (__a[0])};
++ return __builtin_aarch64_lceilv2sfv2si (__a);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcgeq_u16 (uint16x8_t __a, uint16x8_t __b)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtm_u64_f64 (float64x1_t __a)
++vcvtp_u32_f32 (float32x2_t __a)
{
- return (__a >= __b);
-+ return (uint64x1_t) {vcvtmd_u64_f64 (__a[0])};
++ return __builtin_aarch64_lceiluv2sfv2si_us (__a);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgeq_u32 (uint32x4_t __a, uint32x4_t __b)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtmq_s64_f64 (float64x2_t __a)
++vcvtpq_s32_f32 (float32x4_t __a)
{
- return (__a >= __b);
-+ return __builtin_aarch64_lfloorv2dfv2di (__a);
++ return __builtin_aarch64_lceilv4sfv4si (__a);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcgeq_u64 (uint64x2_t __a, uint64x2_t __b)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtmq_u64_f64 (float64x2_t __a)
++vcvtpq_u32_f32 (float32x4_t __a)
{
- return (__a >= __b);
-+ return __builtin_aarch64_lflooruv2dfv2di_us (__a);
++ return __builtin_aarch64_lceiluv4sfv4si_us (__a);
}
-/* vcge - scalar. */
-+/* vcvtn */
-
+-
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vcges_f32 (float32_t __a, float32_t __b)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtnd_s64_f64 (float64_t __a)
++vcvtp_s64_f64 (float64x1_t __a)
{
- return __a >= __b ? -1 : 0;
-+ return __builtin_aarch64_lfrintndfdi (__a);
++ return (int64x1_t) {vcvtpd_s64_f64 (__a[0])};
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcged_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtnd_u64_f64 (float64_t __a)
++vcvtp_u64_f64 (float64x1_t __a)
{
- return __a >= __b ? -1ll : 0ll;
-+ return __builtin_aarch64_lfrintnudfdi_us (__a);
++ return (uint64x1_t) {vcvtpd_u64_f64 (__a[0])};
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcged_u64 (uint64_t __a, uint64_t __b)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtns_s32_f32 (float32_t __a)
++vcvtpq_s64_f64 (float64x2_t __a)
{
- return __a >= __b ? -1ll : 0ll;
-+ return __builtin_aarch64_lfrintnsfsi (__a);
++ return __builtin_aarch64_lceilv2dfv2di (__a);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcged_f64 (float64_t __a, float64_t __b)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtns_u32_f32 (float32_t __a)
++vcvtpq_u64_f64 (float64x2_t __a)
{
- return __a >= __b ? -1ll : 0ll;
-+ return __builtin_aarch64_lfrintnusfsi_us (__a);
++ return __builtin_aarch64_lceiluv2dfv2di_us (__a);
}
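
vcvtp* rounds toward plus infinity (the lceil builtins), hence
__builtin_llceil in the scalar double case. A minimal sketch, not part of
the patch:

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  /* Rounding toward plus infinity matches ceil(): 1.1 -> 2, -1.9 -> -1.  */
  assert (vcvtpd_s64_f64 (1.1) == 2);
  assert (vcvtpd_s64_f64 (-1.9) == -1);
  return 0;
}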
-/* vcgez - vector. */
--
++/* vdup_n */
+
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcgez_f32 (float32x2_t __a)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtn_s32_f32 (float32x2_t __a)
++vdup_n_f16 (float16_t __a)
{
- return (uint32x2_t) (__a >= 0.0f);
-+ return __builtin_aarch64_lfrintnv2sfv2si (__a);
++ return (float16x4_t) {__a, __a, __a, __a};
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcgez_f64 (float64x1_t __a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtn_u32_f32 (float32x2_t __a)
++vdup_n_f32 (float32_t __a)
{
- return (uint64x1_t) (__a[0] >= (float64x1_t) {0.0});
-+ return __builtin_aarch64_lfrintnuv2sfv2si_us (__a);
++ return (float32x2_t) {__a, __a};
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcgez_s8 (int8x8_t __a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtnq_s32_f32 (float32x4_t __a)
++vdup_n_f64 (float64_t __a)
{
- return (uint8x8_t) (__a >= 0);
-+ return __builtin_aarch64_lfrintnv4sfv4si (__a);
++ return (float64x1_t) {__a};
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcgez_s16 (int16x4_t __a)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtnq_u32_f32 (float32x4_t __a)
++vdup_n_p8 (poly8_t __a)
{
- return (uint16x4_t) (__a >= 0);
-+ return __builtin_aarch64_lfrintnuv4sfv4si_us (__a);
++ return (poly8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcgez_s32 (int32x2_t __a)
-+__extension__ extern __inline int64x1_t
++__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtn_s64_f64 (float64x1_t __a)
++vdup_n_p16 (poly16_t __a)
{
- return (uint32x2_t) (__a >= 0);
-+ return (int64x1_t) {vcvtnd_s64_f64 (__a[0])};
++ return (poly16x4_t) {__a, __a, __a, __a};
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcgez_s64 (int64x1_t __a)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtn_u64_f64 (float64x1_t __a)
++vdup_n_p64 (poly64_t __a)
{
- return (uint64x1_t) (__a >= __AARCH64_INT64_C (0));
-+ return (uint64x1_t) {vcvtnd_u64_f64 (__a[0])};
++ return (poly64x1_t) {__a};
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgezq_f32 (float32x4_t __a)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtnq_s64_f64 (float64x2_t __a)
++vdup_n_s8 (int8_t __a)
{
- return (uint32x4_t) (__a >= 0.0f);
-+ return __builtin_aarch64_lfrintnv2dfv2di (__a);
++ return (int8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcgezq_f64 (float64x2_t __a)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtnq_u64_f64 (float64x2_t __a)
++vdup_n_s16 (int16_t __a)
{
- return (uint64x2_t) (__a >= 0.0);
-+ return __builtin_aarch64_lfrintnuv2dfv2di_us (__a);
++ return (int16x4_t) {__a, __a, __a, __a};
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcgezq_s8 (int8x16_t __a)
-+/* vcvtp */
-+
-+__extension__ extern __inline int64_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtpd_s64_f64 (float64_t __a)
++vdup_n_s32 (int32_t __a)
{
- return (uint8x16_t) (__a >= 0);
-+ return __builtin_llceil (__a);
++ return (int32x2_t) {__a, __a};
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcgezq_s16 (int16x8_t __a)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtpd_u64_f64 (float64_t __a)
++vdup_n_s64 (int64_t __a)
{
- return (uint16x8_t) (__a >= 0);
-+ return __builtin_aarch64_lceiludfdi_us (__a);
++ return (int64x1_t) {__a};
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgezq_s32 (int32x4_t __a)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtps_s32_f32 (float32_t __a)
++vdup_n_u8 (uint8_t __a)
{
- return (uint32x4_t) (__a >= 0);
-+ return __builtin_iceilf (__a);
++ return (uint8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcgezq_s64 (int64x2_t __a)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtps_u32_f32 (float32_t __a)
++vdup_n_u16 (uint16_t __a)
{
- return (uint64x2_t) (__a >= __AARCH64_INT64_C (0));
-+ return __builtin_aarch64_lceilusfsi_us (__a);
++ return (uint16x4_t) {__a, __a, __a, __a};
}
-/* vcgez - scalar. */
-
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vcgezs_f32 (float32_t __a)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtp_s32_f32 (float32x2_t __a)
++vdup_n_u32 (uint32_t __a)
{
- return __a >= 0.0f ? -1 : 0;
-+ return __builtin_aarch64_lceilv2sfv2si (__a);
++ return (uint32x2_t) {__a, __a};
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcgezd_s64 (int64_t __a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtp_u32_f32 (float32x2_t __a)
++vdup_n_u64 (uint64_t __a)
{
- return __a >= 0 ? -1ll : 0ll;
-+ return __builtin_aarch64_lceiluv2sfv2si_us (__a);
++ return (uint64x1_t) {__a};
}
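
The vdup_n_* forms above broadcast one scalar into every lane of a 64-bit
vector; the new definitions spell this out as vector initializers, which the
compiler can fold into a single DUP. A minimal usage sketch (not patch
content):

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  /* Both lanes of the 64-bit vector receive the same value.  */
  int32x2_t v = vdup_n_s32 (7);
  assert (vget_lane_s32 (v, 0) == 7);
  assert (vget_lane_s32 (v, 1) == 7);
  return 0;
}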
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcgezd_f64 (float64_t __a)
-+__extension__ extern __inline int32x4_t
++/* vdupq_n */
++
++__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtpq_s32_f32 (float32x4_t __a)
++vdupq_n_f16 (float16_t __a)
{
- return __a >= 0.0 ? -1ll : 0ll;
-+ return __builtin_aarch64_lceilv4sfv4si (__a);
++ return (float16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
}
-/* vcgt - vector. */
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcgt_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtpq_u32_f32 (float32x4_t __a)
++vdupq_n_f32 (float32_t __a)
{
- return (uint32x2_t) (__a > __b);
-+ return __builtin_aarch64_lceiluv4sfv4si_us (__a);
++ return (float32x4_t) {__a, __a, __a, __a};
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcgt_f64 (float64x1_t __a, float64x1_t __b)
-+__extension__ extern __inline int64x1_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtp_s64_f64 (float64x1_t __a)
++vdupq_n_f64 (float64_t __a)
{
- return (uint64x1_t) (__a > __b);
-+ return (int64x1_t) {vcvtpd_s64_f64 (__a[0])};
++ return (float64x2_t) {__a, __a};
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcgt_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtp_u64_f64 (float64x1_t __a)
++vdupq_n_p8 (uint32_t __a)
{
- return (uint8x8_t) (__a > __b);
-+ return (uint64x1_t) {vcvtpd_u64_f64 (__a[0])};
++ return (poly8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
++ __a, __a, __a, __a, __a, __a, __a, __a};
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcgt_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtpq_s64_f64 (float64x2_t __a)
++vdupq_n_p16 (uint32_t __a)
{
- return (uint16x4_t) (__a > __b);
-+ return __builtin_aarch64_lceilv2dfv2di (__a);
++ return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcgt_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtpq_u64_f64 (float64x2_t __a)
++vdupq_n_p64 (uint64_t __a)
{
- return (uint32x2_t) (__a > __b);
-+ return __builtin_aarch64_lceiluv2dfv2di_us (__a);
++ return (poly64x2_t) {__a, __a};
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcgt_s64 (int64x1_t __a, int64x1_t __b)
-+/* vdup_n */
-+
-+__extension__ extern __inline float16x4_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_f16 (float16_t __a)
++vdupq_n_s8 (int32_t __a)
{
- return (uint64x1_t) (__a > __b);
-+ return (float16x4_t) {__a, __a, __a, __a};
++ return (int8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
++ __a, __a, __a, __a, __a, __a, __a, __a};
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcgt_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_f32 (float32_t __a)
++vdupq_n_s16 (int32_t __a)
{
- return (__a > __b);
-+ return (float32x2_t) {__a, __a};
++ return (int16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcgt_u16 (uint16x4_t __a, uint16x4_t __b)
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_f64 (float64_t __a)
++vdupq_n_s32 (int32_t __a)
{
- return (__a > __b);
-+ return (float64x1_t) {__a};
++ return (int32x4_t) {__a, __a, __a, __a};
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcgt_u32 (uint32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline poly8x8_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_p8 (poly8_t __a)
++vdupq_n_s64 (int64_t __a)
{
- return (__a > __b);
-+ return (poly8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
++ return (int64x2_t) {__a, __a};
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcgt_u64 (uint64x1_t __a, uint64x1_t __b)
-+__extension__ extern __inline poly16x4_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_p16 (poly16_t __a)
++vdupq_n_u8 (uint32_t __a)
{
- return (__a > __b);
-+ return (poly16x4_t) {__a, __a, __a, __a};
++ return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
++ __a, __a, __a, __a, __a, __a, __a, __a};
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgtq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_s8 (int8_t __a)
++vdupq_n_u16 (uint32_t __a)
{
- return (uint32x4_t) (__a > __b);
-+ return (int8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
++ return (uint16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcgtq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_s16 (int16_t __a)
++vdupq_n_u32 (uint32_t __a)
{
- return (uint64x2_t) (__a > __b);
-+ return (int16x4_t) {__a, __a, __a, __a};
++ return (uint32x4_t) {__a, __a, __a, __a};
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcgtq_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_s32 (int32_t __a)
++vdupq_n_u64 (uint64_t __a)
{
- return (uint8x16_t) (__a > __b);
-+ return (int32x2_t) {__a, __a};
++ return (uint64x2_t) {__a, __a};
}
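
vdupq_n_* does the same for 128-bit vectors. Sketch (illustration only):

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  /* Sixteen u8 lanes, all 0xff.  */
  uint8x16_t v = vdupq_n_u8 (0xff);
  assert (vgetq_lane_u8 (v, 0) == 0xff);
  assert (vgetq_lane_u8 (v, 15) == 0xff);
  return 0;
}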
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcgtq_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline int64x1_t
++/* vdup_lane */
++
++__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_s64 (int64_t __a)
++vdup_lane_f16 (float16x4_t __a, const int __b)
{
- return (uint16x8_t) (__a > __b);
-+ return (int64x1_t) {__a};
++ return __aarch64_vdup_lane_f16 (__a, __b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgtq_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_u8 (uint8_t __a)
++vdup_lane_f32 (float32x2_t __a, const int __b)
{
- return (uint32x4_t) (__a > __b);
-+ return (uint8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
++ return __aarch64_vdup_lane_f32 (__a, __b);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcgtq_s64 (int64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_u16 (uint16_t __a)
++vdup_lane_f64 (float64x1_t __a, const int __b)
{
- return (uint64x2_t) (__a > __b);
-+ return (uint16x4_t) {__a, __a, __a, __a};
++ return __aarch64_vdup_lane_f64 (__a, __b);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcgtq_u8 (uint8x16_t __a, uint8x16_t __b)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_u32 (uint32_t __a)
++vdup_lane_p8 (poly8x8_t __a, const int __b)
{
- return (__a > __b);
-+ return (uint32x2_t) {__a, __a};
++ return __aarch64_vdup_lane_p8 (__a, __b);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcgtq_u16 (uint16x8_t __a, uint16x8_t __b)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_u64 (uint64_t __a)
++vdup_lane_p16 (poly16x4_t __a, const int __b)
{
- return (__a > __b);
-+ return (uint64x1_t) {__a};
++ return __aarch64_vdup_lane_p16 (__a, __b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgtq_u32 (uint32x4_t __a, uint32x4_t __b)
-+/* vdupq_n */
-+
-+__extension__ extern __inline float16x8_t
++__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_f16 (float16_t __a)
++vdup_lane_p64 (poly64x1_t __a, const int __b)
{
- return (__a > __b);
-+ return (float16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
++ return __aarch64_vdup_lane_p64 (__a, __b);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcgtq_u64 (uint64x2_t __a, uint64x2_t __b)
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_f32 (float32_t __a)
++vdup_lane_s8 (int8x8_t __a, const int __b)
{
- return (__a > __b);
-+ return (float32x4_t) {__a, __a, __a, __a};
++ return __aarch64_vdup_lane_s8 (__a, __b);
}
-/* vcgt - scalar. */
-
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vcgts_f32 (float32_t __a, float32_t __b)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_f64 (float64_t __a)
++vdup_lane_s16 (int16x4_t __a, const int __b)
{
- return __a > __b ? -1 : 0;
-+ return (float64x2_t) {__a, __a};
++ return __aarch64_vdup_lane_s16 (__a, __b);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcgtd_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline poly8x16_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_p8 (uint32_t __a)
++vdup_lane_s32 (int32x2_t __a, const int __b)
{
- return __a > __b ? -1ll : 0ll;
-+ return (poly8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
-+ __a, __a, __a, __a, __a, __a, __a, __a};
++ return __aarch64_vdup_lane_s32 (__a, __b);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcgtd_u64 (uint64_t __a, uint64_t __b)
-+__extension__ extern __inline poly16x8_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_p16 (uint32_t __a)
++vdup_lane_s64 (int64x1_t __a, const int __b)
{
- return __a > __b ? -1ll : 0ll;
-+ return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
++ return __aarch64_vdup_lane_s64 (__a, __b);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcgtd_f64 (float64_t __a, float64_t __b)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_s8 (int32_t __a)
++vdup_lane_u8 (uint8x8_t __a, const int __b)
{
- return __a > __b ? -1ll : 0ll;
-+ return (int8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
-+ __a, __a, __a, __a, __a, __a, __a, __a};
++ return __aarch64_vdup_lane_u8 (__a, __b);
}
-/* vcgtz - vector. */
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_s16 (int32_t __a)
-+{
-+ return (int16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
-+}
-
+-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcgtz_f32 (float32x2_t __a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_s32 (int32_t __a)
++vdup_lane_u16 (uint16x4_t __a, const int __b)
{
- return (uint32x2_t) (__a > 0.0f);
-+ return (int32x4_t) {__a, __a, __a, __a};
++ return __aarch64_vdup_lane_u16 (__a, __b);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcgtz_f64 (float64x1_t __a)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_s64 (int64_t __a)
++vdup_lane_u32 (uint32x2_t __a, const int __b)
{
- return (uint64x1_t) (__a > (float64x1_t) {0.0});
-+ return (int64x2_t) {__a, __a};
++ return __aarch64_vdup_lane_u32 (__a, __b);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcgtz_s8 (int8x8_t __a)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_u8 (uint32_t __a)
++vdup_lane_u64 (uint64x1_t __a, const int __b)
{
- return (uint8x8_t) (__a > 0);
-+ return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
-+ __a, __a, __a, __a, __a, __a, __a, __a};
++ return __aarch64_vdup_lane_u64 (__a, __b);
}
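
vdup_lane_* replicates one lane of a 64-bit source across a 64-bit result;
the lane number must be an integer constant expression, as the "const int
__b" parameter suggests. Illustration, not part of the patch:

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  int16x4_t src = {10, 20, 30, 40};
  /* Broadcast lane 2 (value 30) into every lane.  */
  int16x4_t v = vdup_lane_s16 (src, 2);
  assert (vget_lane_s16 (v, 0) == 30);
  assert (vget_lane_s16 (v, 3) == 30);
  return 0;
}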
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcgtz_s16 (int16x4_t __a)
-+__extension__ extern __inline uint16x8_t
++/* vdup_laneq */
++
++__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_u16 (uint32_t __a)
++vdup_laneq_f16 (float16x8_t __a, const int __b)
{
- return (uint16x4_t) (__a > 0);
-+ return (uint16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
++ return __aarch64_vdup_laneq_f16 (__a, __b);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcgtz_s32 (int32x2_t __a)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_u32 (uint32_t __a)
++vdup_laneq_f32 (float32x4_t __a, const int __b)
{
- return (uint32x2_t) (__a > 0);
-+ return (uint32x4_t) {__a, __a, __a, __a};
++ return __aarch64_vdup_laneq_f32 (__a, __b);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcgtz_s64 (int64x1_t __a)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_u64 (uint64_t __a)
++vdup_laneq_f64 (float64x2_t __a, const int __b)
{
- return (uint64x1_t) (__a > __AARCH64_INT64_C (0));
-+ return (uint64x2_t) {__a, __a};
++ return __aarch64_vdup_laneq_f64 (__a, __b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgtzq_f32 (float32x4_t __a)
-+/* vdup_lane */
-+
-+__extension__ extern __inline float16x4_t
++__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_f16 (float16x4_t __a, const int __b)
++vdup_laneq_p8 (poly8x16_t __a, const int __b)
{
- return (uint32x4_t) (__a > 0.0f);
-+ return __aarch64_vdup_lane_f16 (__a, __b);
++ return __aarch64_vdup_laneq_p8 (__a, __b);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcgtzq_f64 (float64x2_t __a)
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_f32 (float32x2_t __a, const int __b)
++vdup_laneq_p16 (poly16x8_t __a, const int __b)
{
- return (uint64x2_t) (__a > 0.0);
-+ return __aarch64_vdup_lane_f32 (__a, __b);
++ return __aarch64_vdup_laneq_p16 (__a, __b);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcgtzq_s8 (int8x16_t __a)
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_f64 (float64x1_t __a, const int __b)
++vdup_laneq_p64 (poly64x2_t __a, const int __b)
{
- return (uint8x16_t) (__a > 0);
-+ return __aarch64_vdup_lane_f64 (__a, __b);
++ return __aarch64_vdup_laneq_p64 (__a, __b);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcgtzq_s16 (int16x8_t __a)
-+__extension__ extern __inline poly8x8_t
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_p8 (poly8x8_t __a, const int __b)
++vdup_laneq_s8 (int8x16_t __a, const int __b)
{
- return (uint16x8_t) (__a > 0);
-+ return __aarch64_vdup_lane_p8 (__a, __b);
++ return __aarch64_vdup_laneq_s8 (__a, __b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgtzq_s32 (int32x4_t __a)
-+__extension__ extern __inline poly16x4_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_p16 (poly16x4_t __a, const int __b)
++vdup_laneq_s16 (int16x8_t __a, const int __b)
{
- return (uint32x4_t) (__a > 0);
-+ return __aarch64_vdup_lane_p16 (__a, __b);
++ return __aarch64_vdup_laneq_s16 (__a, __b);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcgtzq_s64 (int64x2_t __a)
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_s8 (int8x8_t __a, const int __b)
++vdup_laneq_s32 (int32x4_t __a, const int __b)
{
- return (uint64x2_t) (__a > __AARCH64_INT64_C (0));
-+ return __aarch64_vdup_lane_s8 (__a, __b);
++ return __aarch64_vdup_laneq_s32 (__a, __b);
}
-/* vcgtz - scalar. */
--
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vdup_laneq_s64 (int64x2_t __a, const int __b)
++{
++ return __aarch64_vdup_laneq_s64 (__a, __b);
++}
+
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vcgtzs_f32 (float32_t __a)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_s16 (int16x4_t __a, const int __b)
++vdup_laneq_u8 (uint8x16_t __a, const int __b)
{
- return __a > 0.0f ? -1 : 0;
-+ return __aarch64_vdup_lane_s16 (__a, __b);
++ return __aarch64_vdup_laneq_u8 (__a, __b);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcgtzd_s64 (int64_t __a)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_s32 (int32x2_t __a, const int __b)
++vdup_laneq_u16 (uint16x8_t __a, const int __b)
{
- return __a > 0 ? -1ll : 0ll;
-+ return __aarch64_vdup_lane_s32 (__a, __b);
++ return __aarch64_vdup_laneq_u16 (__a, __b);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcgtzd_f64 (float64_t __a)
-+__extension__ extern __inline int64x1_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_s64 (int64x1_t __a, const int __b)
++vdup_laneq_u32 (uint32x4_t __a, const int __b)
{
- return __a > 0.0 ? -1ll : 0ll;
-+ return __aarch64_vdup_lane_s64 (__a, __b);
++ return __aarch64_vdup_laneq_u32 (__a, __b);
}
-/* vcle - vector. */
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcle_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_u8 (uint8x8_t __a, const int __b)
++vdup_laneq_u64 (uint64x2_t __a, const int __b)
{
- return (uint32x2_t) (__a <= __b);
-+ return __aarch64_vdup_lane_u8 (__a, __b);
++ return __aarch64_vdup_laneq_u64 (__a, __b);
}
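
vdup_laneq_* narrows: it selects one lane of a 128-bit source and broadcasts
it into a 64-bit result. Sketch (illustration only):

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  int32x4_t src = {1, 2, 3, 4};
  /* Lane 3 of the q-register source fills both d-register lanes.  */
  int32x2_t v = vdup_laneq_s32 (src, 3);
  assert (vget_lane_s32 (v, 0) == 4);
  assert (vget_lane_s32 (v, 1) == 4);
  return 0;
}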
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcle_f64 (float64x1_t __a, float64x1_t __b)
-+__extension__ extern __inline uint16x4_t
++/* vdupq_lane */
++
++__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_u16 (uint16x4_t __a, const int __b)
++vdupq_lane_f16 (float16x4_t __a, const int __b)
{
- return (uint64x1_t) (__a <= __b);
-+ return __aarch64_vdup_lane_u16 (__a, __b);
++ return __aarch64_vdupq_lane_f16 (__a, __b);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcle_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_u32 (uint32x2_t __a, const int __b)
++vdupq_lane_f32 (float32x2_t __a, const int __b)
{
- return (uint8x8_t) (__a <= __b);
-+ return __aarch64_vdup_lane_u32 (__a, __b);
++ return __aarch64_vdupq_lane_f32 (__a, __b);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcle_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_u64 (uint64x1_t __a, const int __b)
++vdupq_lane_f64 (float64x1_t __a, const int __b)
{
- return (uint16x4_t) (__a <= __b);
-+ return __aarch64_vdup_lane_u64 (__a, __b);
++ return __aarch64_vdupq_lane_f64 (__a, __b);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcle_s32 (int32x2_t __a, int32x2_t __b)
-+/* vdup_laneq */
-+
-+__extension__ extern __inline float16x4_t
++__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_f16 (float16x8_t __a, const int __b)
++vdupq_lane_p8 (poly8x8_t __a, const int __b)
{
- return (uint32x2_t) (__a <= __b);
-+ return __aarch64_vdup_laneq_f16 (__a, __b);
++ return __aarch64_vdupq_lane_p8 (__a, __b);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcle_s64 (int64x1_t __a, int64x1_t __b)
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_f32 (float32x4_t __a, const int __b)
++vdupq_lane_p16 (poly16x4_t __a, const int __b)
{
- return (uint64x1_t) (__a <= __b);
-+ return __aarch64_vdup_laneq_f32 (__a, __b);
++ return __aarch64_vdupq_lane_p16 (__a, __b);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcle_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_f64 (float64x2_t __a, const int __b)
++vdupq_lane_p64 (poly64x1_t __a, const int __b)
{
- return (__a <= __b);
-+ return __aarch64_vdup_laneq_f64 (__a, __b);
++ return __aarch64_vdupq_lane_p64 (__a, __b);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcle_u16 (uint16x4_t __a, uint16x4_t __b)
-+__extension__ extern __inline poly8x8_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_p8 (poly8x16_t __a, const int __b)
++vdupq_lane_s8 (int8x8_t __a, const int __b)
{
- return (__a <= __b);
-+ return __aarch64_vdup_laneq_p8 (__a, __b);
++ return __aarch64_vdupq_lane_s8 (__a, __b);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcle_u32 (uint32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline poly16x4_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_p16 (poly16x8_t __a, const int __b)
++vdupq_lane_s16 (int16x4_t __a, const int __b)
{
- return (__a <= __b);
-+ return __aarch64_vdup_laneq_p16 (__a, __b);
++ return __aarch64_vdupq_lane_s16 (__a, __b);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcle_u64 (uint64x1_t __a, uint64x1_t __b)
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_s8 (int8x16_t __a, const int __b)
++vdupq_lane_s32 (int32x2_t __a, const int __b)
{
- return (__a <= __b);
-+ return __aarch64_vdup_laneq_s8 (__a, __b);
++ return __aarch64_vdupq_lane_s32 (__a, __b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcleq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_s16 (int16x8_t __a, const int __b)
++vdupq_lane_s64 (int64x1_t __a, const int __b)
{
- return (uint32x4_t) (__a <= __b);
-+ return __aarch64_vdup_laneq_s16 (__a, __b);
++ return __aarch64_vdupq_lane_s64 (__a, __b);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcleq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_s32 (int32x4_t __a, const int __b)
++vdupq_lane_u8 (uint8x8_t __a, const int __b)
{
- return (uint64x2_t) (__a <= __b);
-+ return __aarch64_vdup_laneq_s32 (__a, __b);
++ return __aarch64_vdupq_lane_u8 (__a, __b);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcleq_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline int64x1_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_s64 (int64x2_t __a, const int __b)
++vdupq_lane_u16 (uint16x4_t __a, const int __b)
{
- return (uint8x16_t) (__a <= __b);
-+ return __aarch64_vdup_laneq_s64 (__a, __b);
++ return __aarch64_vdupq_lane_u16 (__a, __b);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcleq_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_u8 (uint8x16_t __a, const int __b)
++vdupq_lane_u32 (uint32x2_t __a, const int __b)
{
- return (uint16x8_t) (__a <= __b);
-+ return __aarch64_vdup_laneq_u8 (__a, __b);
++ return __aarch64_vdupq_lane_u32 (__a, __b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcleq_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_u16 (uint16x8_t __a, const int __b)
++vdupq_lane_u64 (uint64x1_t __a, const int __b)
{
- return (uint32x4_t) (__a <= __b);
-+ return __aarch64_vdup_laneq_u16 (__a, __b);
++ return __aarch64_vdupq_lane_u64 (__a, __b);
}
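
vdupq_lane_* widens in the opposite direction, broadcasting a lane of a
64-bit source into all lanes of a 128-bit result. Sketch (illustration
only):

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  float32x2_t src = {1.0f, 2.0f};
  /* Lane 1 (2.0f) fills all four q-register lanes.  */
  float32x4_t v = vdupq_lane_f32 (src, 1);
  assert (vgetq_lane_f32 (v, 0) == 2.0f);
  assert (vgetq_lane_f32 (v, 3) == 2.0f);
  return 0;
}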
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcleq_s64 (int64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline uint32x2_t
++/* vdupq_laneq */
++
++__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_u32 (uint32x4_t __a, const int __b)
++vdupq_laneq_f16 (float16x8_t __a, const int __b)
{
- return (uint64x2_t) (__a <= __b);
-+ return __aarch64_vdup_laneq_u32 (__a, __b);
++ return __aarch64_vdupq_laneq_f16 (__a, __b);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcleq_u8 (uint8x16_t __a, uint8x16_t __b)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_u64 (uint64x2_t __a, const int __b)
++vdupq_laneq_f32 (float32x4_t __a, const int __b)
{
- return (__a <= __b);
-+ return __aarch64_vdup_laneq_u64 (__a, __b);
++ return __aarch64_vdupq_laneq_f32 (__a, __b);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcleq_u16 (uint16x8_t __a, uint16x8_t __b)
-+/* vdupq_lane */
-+
-+__extension__ extern __inline float16x8_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_f16 (float16x4_t __a, const int __b)
++vdupq_laneq_f64 (float64x2_t __a, const int __b)
{
- return (__a <= __b);
-+ return __aarch64_vdupq_lane_f16 (__a, __b);
++ return __aarch64_vdupq_laneq_f64 (__a, __b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcleq_u32 (uint32x4_t __a, uint32x4_t __b)
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_f32 (float32x2_t __a, const int __b)
++vdupq_laneq_p8 (poly8x16_t __a, const int __b)
{
- return (__a <= __b);
-+ return __aarch64_vdupq_lane_f32 (__a, __b);
++ return __aarch64_vdupq_laneq_p8 (__a, __b);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcleq_u64 (uint64x2_t __a, uint64x2_t __b)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_f64 (float64x1_t __a, const int __b)
++vdupq_laneq_p16 (poly16x8_t __a, const int __b)
{
- return (__a <= __b);
-+ return __aarch64_vdupq_lane_f64 (__a, __b);
++ return __aarch64_vdupq_laneq_p16 (__a, __b);
}
-/* vcle - scalar. */
-
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vcles_f32 (float32_t __a, float32_t __b)
-+__extension__ extern __inline poly8x16_t
++__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_p8 (poly8x8_t __a, const int __b)
++vdupq_laneq_p64 (poly64x2_t __a, const int __b)
{
- return __a <= __b ? -1 : 0;
-+ return __aarch64_vdupq_lane_p8 (__a, __b);
++ return __aarch64_vdupq_laneq_p64 (__a, __b);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcled_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline poly16x8_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_p16 (poly16x4_t __a, const int __b)
++vdupq_laneq_s8 (int8x16_t __a, const int __b)
{
- return __a <= __b ? -1ll : 0ll;
-+ return __aarch64_vdupq_lane_p16 (__a, __b);
++ return __aarch64_vdupq_laneq_s8 (__a, __b);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcled_u64 (uint64_t __a, uint64_t __b)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_s8 (int8x8_t __a, const int __b)
++vdupq_laneq_s16 (int16x8_t __a, const int __b)
{
- return __a <= __b ? -1ll : 0ll;
-+ return __aarch64_vdupq_lane_s8 (__a, __b);
++ return __aarch64_vdupq_laneq_s16 (__a, __b);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcled_f64 (float64_t __a, float64_t __b)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_s16 (int16x4_t __a, const int __b)
++vdupq_laneq_s32 (int32x4_t __a, const int __b)
{
- return __a <= __b ? -1ll : 0ll;
-+ return __aarch64_vdupq_lane_s16 (__a, __b);
++ return __aarch64_vdupq_laneq_s32 (__a, __b);
}
-/* vclez - vector. */
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vclez_f32 (float32x2_t __a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_s32 (int32x2_t __a, const int __b)
++vdupq_laneq_s64 (int64x2_t __a, const int __b)
{
- return (uint32x2_t) (__a <= 0.0f);
-+ return __aarch64_vdupq_lane_s32 (__a, __b);
++ return __aarch64_vdupq_laneq_s64 (__a, __b);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vclez_f64 (float64x1_t __a)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_s64 (int64x1_t __a, const int __b)
++vdupq_laneq_u8 (uint8x16_t __a, const int __b)
{
- return (uint64x1_t) (__a <= (float64x1_t) {0.0});
-+ return __aarch64_vdupq_lane_s64 (__a, __b);
++ return __aarch64_vdupq_laneq_u8 (__a, __b);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vclez_s8 (int8x8_t __a)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_u8 (uint8x8_t __a, const int __b)
++vdupq_laneq_u16 (uint16x8_t __a, const int __b)
{
- return (uint8x8_t) (__a <= 0);
-+ return __aarch64_vdupq_lane_u8 (__a, __b);
++ return __aarch64_vdupq_laneq_u16 (__a, __b);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vclez_s16 (int16x4_t __a)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_u16 (uint16x4_t __a, const int __b)
++vdupq_laneq_u32 (uint32x4_t __a, const int __b)
{
- return (uint16x4_t) (__a <= 0);
-+ return __aarch64_vdupq_lane_u16 (__a, __b);
++ return __aarch64_vdupq_laneq_u32 (__a, __b);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vclez_s32 (int32x2_t __a)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_u32 (uint32x2_t __a, const int __b)
++vdupq_laneq_u64 (uint64x2_t __a, const int __b)
{
- return (uint32x2_t) (__a <= 0);
-+ return __aarch64_vdupq_lane_u32 (__a, __b);
++ return __aarch64_vdupq_laneq_u64 (__a, __b);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vclez_s64 (int64x1_t __a)
-+__extension__ extern __inline uint64x2_t
++/* vdupb_lane */
++__extension__ extern __inline poly8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_u64 (uint64x1_t __a, const int __b)
++vdupb_lane_p8 (poly8x8_t __a, const int __b)
{
- return (uint64x1_t) (__a <= __AARCH64_INT64_C (0));
-+ return __aarch64_vdupq_lane_u64 (__a, __b);
++ return __aarch64_vget_lane_any (__a, __b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vclezq_f32 (float32x4_t __a)
-+/* vdupq_laneq */
-+
-+__extension__ extern __inline float16x8_t
++__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_f16 (float16x8_t __a, const int __b)
++vdupb_lane_s8 (int8x8_t __a, const int __b)
{
- return (uint32x4_t) (__a <= 0.0f);
-+ return __aarch64_vdupq_laneq_f16 (__a, __b);
++ return __aarch64_vget_lane_any (__a, __b);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vclezq_f64 (float64x2_t __a)
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_f32 (float32x4_t __a, const int __b)
++vdupb_lane_u8 (uint8x8_t __a, const int __b)
{
- return (uint64x2_t) (__a <= 0.0);
-+ return __aarch64_vdupq_laneq_f32 (__a, __b);
++ return __aarch64_vget_lane_any (__a, __b);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vclezq_s8 (int8x16_t __a)
-+__extension__ extern __inline float64x2_t
++/* vduph_lane */
++
++__extension__ extern __inline float16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_f64 (float64x2_t __a, const int __b)
++vduph_lane_f16 (float16x4_t __a, const int __b)
{
- return (uint8x16_t) (__a <= 0);
-+ return __aarch64_vdupq_laneq_f64 (__a, __b);
++ return __aarch64_vget_lane_any (__a, __b);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vclezq_s16 (int16x8_t __a)
-+__extension__ extern __inline poly8x16_t
++__extension__ extern __inline poly16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_p8 (poly8x16_t __a, const int __b)
++vduph_lane_p16 (poly16x4_t __a, const int __b)
{
- return (uint16x8_t) (__a <= 0);
-+ return __aarch64_vdupq_laneq_p8 (__a, __b);
++ return __aarch64_vget_lane_any (__a, __b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vclezq_s32 (int32x4_t __a)
-+__extension__ extern __inline poly16x8_t
++__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_p16 (poly16x8_t __a, const int __b)
++vduph_lane_s16 (int16x4_t __a, const int __b)
{
- return (uint32x4_t) (__a <= 0);
-+ return __aarch64_vdupq_laneq_p16 (__a, __b);
++ return __aarch64_vget_lane_any (__a, __b);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vclezq_s64 (int64x2_t __a)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_s8 (int8x16_t __a, const int __b)
++vduph_lane_u16 (uint16x4_t __a, const int __b)
{
- return (uint64x2_t) (__a <= __AARCH64_INT64_C (0));
-+ return __aarch64_vdupq_laneq_s8 (__a, __b);
++ return __aarch64_vget_lane_any (__a, __b);
}
-/* vclez - scalar. */
--
++/* vdups_lane */
+
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vclezs_f32 (float32_t __a)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_s16 (int16x8_t __a, const int __b)
++vdups_lane_f32 (float32x2_t __a, const int __b)
{
- return __a <= 0.0f ? -1 : 0;
-+ return __aarch64_vdupq_laneq_s16 (__a, __b);
++ return __aarch64_vget_lane_any (__a, __b);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vclezd_s64 (int64_t __a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_s32 (int32x4_t __a, const int __b)
++vdups_lane_s32 (int32x2_t __a, const int __b)
{
- return __a <= 0 ? -1ll : 0ll;
-+ return __aarch64_vdupq_laneq_s32 (__a, __b);
++ return __aarch64_vget_lane_any (__a, __b);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vclezd_f64 (float64_t __a)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_s64 (int64x2_t __a, const int __b)
++vdups_lane_u32 (uint32x2_t __a, const int __b)
{
- return __a <= 0.0 ? -1ll : 0ll;
-+ return __aarch64_vdupq_laneq_s64 (__a, __b);
++ return __aarch64_vget_lane_any (__a, __b);
}
-/* vclt - vector. */
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vclt_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline uint8x16_t
++/* vdupd_lane */
++__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_u8 (uint8x16_t __a, const int __b)
++vdupd_lane_f64 (float64x1_t __a, const int __b)
{
- return (uint32x2_t) (__a < __b);
-+ return __aarch64_vdupq_laneq_u8 (__a, __b);
++ __AARCH64_LANE_CHECK (__a, __b);
++ return __a[0];
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vclt_f64 (float64x1_t __a, float64x1_t __b)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_u16 (uint16x8_t __a, const int __b)
++vdupd_lane_s64 (int64x1_t __a, const int __b)
{
- return (uint64x1_t) (__a < __b);
-+ return __aarch64_vdupq_laneq_u16 (__a, __b);
++ __AARCH64_LANE_CHECK (__a, __b);
++ return __a[0];
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vclt_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_u32 (uint32x4_t __a, const int __b)
++vdupd_lane_u64 (uint64x1_t __a, const int __b)
{
- return (uint8x8_t) (__a < __b);
-+ return __aarch64_vdupq_laneq_u32 (__a, __b);
++ __AARCH64_LANE_CHECK (__a, __b);
++ return __a[0];
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vclt_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline uint64x2_t
++/* vdupb_laneq */
++__extension__ extern __inline poly8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_u64 (uint64x2_t __a, const int __b)
++vdupb_laneq_p8 (poly8x16_t __a, const int __b)
{
- return (uint16x4_t) (__a < __b);
-+ return __aarch64_vdupq_laneq_u64 (__a, __b);
++ return __aarch64_vget_lane_any (__a, __b);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vclt_s32 (int32x2_t __a, int32x2_t __b)
-+/* vdupb_lane */
-+__extension__ extern __inline poly8_t
++__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupb_lane_p8 (poly8x8_t __a, const int __b)
++vdupb_laneq_s8 (int8x16_t __a, const int __b)
{
- return (uint32x2_t) (__a < __b);
+ return __aarch64_vget_lane_any (__a, __b);
@@ -25557,9 +27239,9 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vclt_s64 (int64x1_t __a, int64x1_t __b)
-+__extension__ extern __inline int8_t
++__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupb_lane_s8 (int8x8_t __a, const int __b)
++vdupb_laneq_u8 (uint8x16_t __a, const int __b)
{
- return (uint64x1_t) (__a < __b);
+ return __aarch64_vget_lane_any (__a, __b);
@@ -25567,21 +27249,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vclt_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupb_lane_u8 (uint8x8_t __a, const int __b)
- {
+-{
- return (__a < __b);
-+ return __aarch64_vget_lane_any (__a, __b);
- }
+-}
++/* vduph_laneq */
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vclt_u16 (uint16x4_t __a, uint16x4_t __b)
-+/* vduph_lane */
-+
+__extension__ extern __inline float16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vduph_lane_f16 (float16x4_t __a, const int __b)
++vduph_laneq_f16 (float16x8_t __a, const int __b)
{
- return (__a < __b);
+ return __aarch64_vget_lane_any (__a, __b);
@@ -25591,7 +27268,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vclt_u32 (uint32x2_t __a, uint32x2_t __b)
+__extension__ extern __inline poly16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vduph_lane_p16 (poly16x4_t __a, const int __b)
++vduph_laneq_p16 (poly16x8_t __a, const int __b)
{
- return (__a < __b);
+ return __aarch64_vget_lane_any (__a, __b);
@@ -25601,7 +27278,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vclt_u64 (uint64x1_t __a, uint64x1_t __b)
+__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vduph_lane_s16 (int16x4_t __a, const int __b)
++vduph_laneq_s16 (int16x8_t __a, const int __b)
{
- return (__a < __b);
+ return __aarch64_vget_lane_any (__a, __b);
@@ -25611,7 +27288,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vcltq_f32 (float32x4_t __a, float32x4_t __b)
+__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vduph_lane_u16 (uint16x4_t __a, const int __b)
++vduph_laneq_u16 (uint16x8_t __a, const int __b)
{
- return (uint32x4_t) (__a < __b);
+ return __aarch64_vget_lane_any (__a, __b);
@@ -25619,11 +27296,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcltq_f64 (float64x2_t __a, float64x2_t __b)
-+/* vdups_lane */
++/* vdups_laneq */
+
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdups_lane_f32 (float32x2_t __a, const int __b)
++vdups_laneq_f32 (float32x4_t __a, const int __b)
{
- return (uint64x2_t) (__a < __b);
+ return __aarch64_vget_lane_any (__a, __b);
@@ -25633,7 +27310,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vcltq_s8 (int8x16_t __a, int8x16_t __b)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdups_lane_s32 (int32x2_t __a, const int __b)
++vdups_laneq_s32 (int32x4_t __a, const int __b)
{
- return (uint8x16_t) (__a < __b);
+ return __aarch64_vget_lane_any (__a, __b);
@@ -25643,7 +27320,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vcltq_s16 (int16x8_t __a, int16x8_t __b)
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdups_lane_u32 (uint32x2_t __a, const int __b)
++vdups_laneq_u32 (uint32x4_t __a, const int __b)
{
- return (uint16x8_t) (__a < __b);
+ return __aarch64_vget_lane_any (__a, __b);
@@ -25651,339 +27328,214 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcltq_s32 (int32x4_t __a, int32x4_t __b)
-+/* vdupd_lane */
++/* vdupd_laneq */
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupd_lane_f64 (float64x1_t __a, const int __b)
++vdupd_laneq_f64 (float64x2_t __a, const int __b)
{
- return (uint32x4_t) (__a < __b);
-+ __AARCH64_LANE_CHECK (__a, __b);
-+ return __a[0];
++ return __aarch64_vget_lane_any (__a, __b);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcltq_s64 (int64x2_t __a, int64x2_t __b)
+__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupd_lane_s64 (int64x1_t __a, const int __b)
++vdupd_laneq_s64 (int64x2_t __a, const int __b)
{
- return (uint64x2_t) (__a < __b);
-+ __AARCH64_LANE_CHECK (__a, __b);
-+ return __a[0];
++ return __aarch64_vget_lane_any (__a, __b);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcltq_u8 (uint8x16_t __a, uint8x16_t __b)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupd_lane_u64 (uint64x1_t __a, const int __b)
++vdupd_laneq_u64 (uint64x2_t __a, const int __b)
{
- return (__a < __b);
-+ __AARCH64_LANE_CHECK (__a, __b);
-+ return __a[0];
++ return __aarch64_vget_lane_any (__a, __b);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcltq_u16 (uint16x8_t __a, uint16x8_t __b)
-+/* vdupb_laneq */
-+__extension__ extern __inline poly8_t
++/* vext */
++
++__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupb_laneq_p8 (poly8x16_t __a, const int __b)
++vext_f16 (float16x4_t __a, float16x4_t __b, __const int __c)
{
- return (__a < __b);
-+ return __aarch64_vget_lane_any (__a, __b);
++ __AARCH64_LANE_CHECK (__a, __c);
++#ifdef __AARCH64EB__
++ return __builtin_shuffle (__b, __a,
++ (uint16x4_t) {4 - __c, 5 - __c, 6 - __c, 7 - __c});
++#else
++ return __builtin_shuffle (__a, __b,
++ (uint16x4_t) {__c, __c + 1, __c + 2, __c + 3});
++#endif
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcltq_u32 (uint32x4_t __a, uint32x4_t __b)
-+__extension__ extern __inline int8_t
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupb_laneq_s8 (int8x16_t __a, const int __b)
++vext_f32 (float32x2_t __a, float32x2_t __b, __const int __c)
{
- return (__a < __b);
-+ return __aarch64_vget_lane_any (__a, __b);
++ __AARCH64_LANE_CHECK (__a, __c);
++#ifdef __AARCH64EB__
++ return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c});
++#else
++ return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1});
++#endif
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcltq_u64 (uint64x2_t __a, uint64x2_t __b)
-+__extension__ extern __inline uint8_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupb_laneq_u8 (uint8x16_t __a, const int __b)
++vext_f64 (float64x1_t __a, float64x1_t __b, __const int __c)
{
- return (__a < __b);
-+ return __aarch64_vget_lane_any (__a, __b);
++ __AARCH64_LANE_CHECK (__a, __c);
++ /* The only possible index to the assembler instruction returns element 0. */
++ return __a;
}
-
+-
-/* vclt - scalar. */
-+/* vduph_laneq */
-
+-
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vclts_f32 (float32_t __a, float32_t __b)
-+__extension__ extern __inline float16_t
++__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vduph_laneq_f16 (float16x8_t __a, const int __b)
++vext_p8 (poly8x8_t __a, poly8x8_t __b, __const int __c)
{
- return __a < __b ? -1 : 0;
-+ return __aarch64_vget_lane_any (__a, __b);
++ __AARCH64_LANE_CHECK (__a, __c);
++#ifdef __AARCH64EB__
++ return __builtin_shuffle (__b, __a, (uint8x8_t)
++ {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
++#else
++ return __builtin_shuffle (__a, __b,
++ (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
++#endif
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcltd_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline poly16_t
++__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vduph_laneq_p16 (poly16x8_t __a, const int __b)
++vext_p16 (poly16x4_t __a, poly16x4_t __b, __const int __c)
{
- return __a < __b ? -1ll : 0ll;
-+ return __aarch64_vget_lane_any (__a, __b);
++ __AARCH64_LANE_CHECK (__a, __c);
++#ifdef __AARCH64EB__
++ return __builtin_shuffle (__b, __a,
++ (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
++#else
++ return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3});
++#endif
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcltd_u64 (uint64_t __a, uint64_t __b)
-+__extension__ extern __inline int16_t
++__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vduph_laneq_s16 (int16x8_t __a, const int __b)
++vext_p64 (poly64x1_t __a, poly64x1_t __b, __const int __c)
{
- return __a < __b ? -1ll : 0ll;
-+ return __aarch64_vget_lane_any (__a, __b);
++ __AARCH64_LANE_CHECK (__a, __c);
++ /* The only possible index to the assembler instruction returns element 0. */
++ return __a;
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcltd_f64 (float64_t __a, float64_t __b)
-+__extension__ extern __inline uint16_t
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vduph_laneq_u16 (uint16x8_t __a, const int __b)
++vext_s8 (int8x8_t __a, int8x8_t __b, __const int __c)
{
- return __a < __b ? -1ll : 0ll;
-+ return __aarch64_vget_lane_any (__a, __b);
++ __AARCH64_LANE_CHECK (__a, __c);
++#ifdef __AARCH64EB__
++ return __builtin_shuffle (__b, __a, (uint8x8_t)
++ {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
++#else
++ return __builtin_shuffle (__a, __b,
++ (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
++#endif
}
-/* vcltz - vector. */
-+/* vdups_laneq */
-
+-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcltz_f32 (float32x2_t __a)
-+__extension__ extern __inline float32_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdups_laneq_f32 (float32x4_t __a, const int __b)
++vext_s16 (int16x4_t __a, int16x4_t __b, __const int __c)
{
- return (uint32x2_t) (__a < 0.0f);
-+ return __aarch64_vget_lane_any (__a, __b);
++ __AARCH64_LANE_CHECK (__a, __c);
++#ifdef __AARCH64EB__
++ return __builtin_shuffle (__b, __a,
++ (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
++#else
++ return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3});
++#endif
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcltz_f64 (float64x1_t __a)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdups_laneq_s32 (int32x4_t __a, const int __b)
++vext_s32 (int32x2_t __a, int32x2_t __b, __const int __c)
{
- return (uint64x1_t) (__a < (float64x1_t) {0.0});
-+ return __aarch64_vget_lane_any (__a, __b);
++ __AARCH64_LANE_CHECK (__a, __c);
++#ifdef __AARCH64EB__
++ return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c});
++#else
++ return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1});
++#endif
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcltz_s8 (int8x8_t __a)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdups_laneq_u32 (uint32x4_t __a, const int __b)
++vext_s64 (int64x1_t __a, int64x1_t __b, __const int __c)
{
- return (uint8x8_t) (__a < 0);
-+ return __aarch64_vget_lane_any (__a, __b);
++ __AARCH64_LANE_CHECK (__a, __c);
++ /* The only possible index to the assembler instruction returns element 0. */
++ return __a;
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcltz_s16 (int16x4_t __a)
-+/* vdupd_laneq */
-+__extension__ extern __inline float64_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupd_laneq_f64 (float64x2_t __a, const int __b)
++vext_u8 (uint8x8_t __a, uint8x8_t __b, __const int __c)
{
- return (uint16x4_t) (__a < 0);
-+ return __aarch64_vget_lane_any (__a, __b);
++ __AARCH64_LANE_CHECK (__a, __c);
++#ifdef __AARCH64EB__
++ return __builtin_shuffle (__b, __a, (uint8x8_t)
++ {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
++#else
++ return __builtin_shuffle (__a, __b,
++ (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
++#endif
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcltz_s32 (int32x2_t __a)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupd_laneq_s64 (int64x2_t __a, const int __b)
++vext_u16 (uint16x4_t __a, uint16x4_t __b, __const int __c)
{
- return (uint32x2_t) (__a < 0);
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcltz_s64 (int64x1_t __a)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupd_laneq_u64 (uint64x2_t __a, const int __b)
- {
-- return (uint64x1_t) (__a < __AARCH64_INT64_C (0));
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcltzq_f32 (float32x4_t __a)
-+/* vext */
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_f16 (float16x4_t __a, float16x4_t __b, __const int __c)
- {
-- return (uint32x4_t) (__a < 0.0f);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a,
-+ (uint16x4_t) {4 - __c, 5 - __c, 6 - __c, 7 - __c});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint16x4_t) {__c, __c + 1, __c + 2, __c + 3});
-+#endif
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcltzq_f64 (float64x2_t __a)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_f32 (float32x2_t __a, float32x2_t __b, __const int __c)
- {
-- return (uint64x2_t) (__a < 0.0);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1});
-+#endif
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vcltzq_s8 (int8x16_t __a)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_f64 (float64x1_t __a, float64x1_t __b, __const int __c)
- {
-- return (uint8x16_t) (__a < 0);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+ /* The only possible index to the assembler instruction returns element 0. */
-+ return __a;
- }
--
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vcltzq_s16 (int16x8_t __a)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_p8 (poly8x8_t __a, poly8x8_t __b, __const int __c)
- {
-- return (uint16x8_t) (__a < 0);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a, (uint8x8_t)
-+ {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
-+#endif
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcltzq_s32 (int32x4_t __a)
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_p16 (poly16x4_t __a, poly16x4_t __b, __const int __c)
- {
-- return (uint32x4_t) (__a < 0);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a,
-+ (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3});
-+#endif
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcltzq_s64 (int64x2_t __a)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_s8 (int8x8_t __a, int8x8_t __b, __const int __c)
- {
-- return (uint64x2_t) (__a < __AARCH64_INT64_C (0));
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a, (uint8x8_t)
-+ {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
-+#endif
- }
-
--/* vcltz - scalar. */
--
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vcltzs_f32 (float32_t __a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_s16 (int16x4_t __a, int16x4_t __b, __const int __c)
- {
-- return __a < 0.0f ? -1 : 0;
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a,
-+ (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3});
-+#endif
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcltzd_s64 (int64_t __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_s32 (int32x2_t __a, int32x2_t __b, __const int __c)
- {
-- return __a < 0 ? -1ll : 0ll;
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1});
-+#endif
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcltzd_f64 (float64_t __a)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_s64 (int64x1_t __a, int64x1_t __b, __const int __c)
- {
-- return __a < 0.0 ? -1ll : 0ll;
-+ __AARCH64_LANE_CHECK (__a, __c);
-+ /* The only possible index to the assembler instruction returns element 0. */
-+ return __a;
- }
-
--/* vcls. */
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_u8 (uint8x8_t __a, uint8x8_t __b, __const int __c)
-+{
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a, (uint8x8_t)
-+ {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
-+#endif
-+}
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vcls_s8 (int8x8_t __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_u16 (uint16x4_t __a, uint16x4_t __b, __const int __c)
- {
-- return __builtin_aarch64_clrsbv8qi (__a);
+ __AARCH64_LANE_CHECK (__a, __c);
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__b, __a,
@@ -25993,13 +27545,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+#endif
}
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vcls_s16 (int16x4_t __a)
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+-vcltz_s64 (int64x1_t __a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vext_u32 (uint32x2_t __a, uint32x2_t __b, __const int __c)
{
-- return __builtin_aarch64_clrsbv4hi (__a);
+- return (uint64x1_t) (__a < __AARCH64_INT64_C (0));
+ __AARCH64_LANE_CHECK (__a, __c);
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c});
@@ -26008,25 +27560,25 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+#endif
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vcls_s32 (int32x2_t __a)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vcltzq_f32 (float32x4_t __a)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vext_u64 (uint64x1_t __a, uint64x1_t __b, __const int __c)
{
-- return __builtin_aarch64_clrsbv2si (__a);
+- return (uint32x4_t) (__a < 0.0f);
+ __AARCH64_LANE_CHECK (__a, __c);
+ /* The only possible index to the assembler instruction returns element 0. */
+ return __a;
}
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vclsq_s8 (int8x16_t __a)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vcltzq_f64 (float64x2_t __a)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vextq_f16 (float16x8_t __a, float16x8_t __b, __const int __c)
{
-- return __builtin_aarch64_clrsbv16qi (__a);
+- return (uint64x2_t) (__a < 0.0);
+ __AARCH64_LANE_CHECK (__a, __c);
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__b, __a,
@@ -26040,13 +27592,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+#endif
}
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vclsq_s16 (int16x8_t __a)
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vcltzq_s8 (int8x16_t __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vextq_f32 (float32x4_t __a, float32x4_t __b, __const int __c)
{
-- return __builtin_aarch64_clrsbv8hi (__a);
+- return (uint8x16_t) (__a < 0);
+ __AARCH64_LANE_CHECK (__a, __c);
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__b, __a,
@@ -26056,13 +27608,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+#endif
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vclsq_s32 (int32x4_t __a)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vcltzq_s16 (int16x8_t __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vextq_f64 (float64x2_t __a, float64x2_t __b, __const int __c)
{
-- return __builtin_aarch64_clrsbv4si (__a);
+- return (uint16x8_t) (__a < 0);
+ __AARCH64_LANE_CHECK (__a, __c);
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c});
@@ -26071,11 +27623,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+#endif
}
--/* vclz. */
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vcltzq_s32 (int32x4_t __a)
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vextq_p8 (poly8x16_t __a, poly8x16_t __b, __const int __c)
-+{
+ {
+- return (uint32x4_t) (__a < 0);
+ __AARCH64_LANE_CHECK (__a, __c);
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__b, __a, (uint8x16_t)
@@ -26086,15 +27640,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7,
+ __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15});
+#endif
-+}
+ }
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vclz_s8 (int8x8_t __a)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vcltzq_s64 (int64x2_t __a)
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vextq_p16 (poly16x8_t __a, poly16x8_t __b, __const int __c)
{
-- return __builtin_aarch64_clzv8qi (__a);
+- return (uint64x2_t) (__a < __AARCH64_INT64_C (0));
+ __AARCH64_LANE_CHECK (__a, __c);
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__b, __a, (uint16x8_t)
@@ -26105,13 +27659,30 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+#endif
}
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vclz_s16 (int16x4_t __a)
+-/* vcltz - scalar. */
+-
+-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+-vcltzs_f32 (float32_t __a)
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vextq_p64 (poly64x2_t __a, poly64x2_t __b, __const int __c)
+ {
+- return __a < 0.0f ? -1 : 0;
++ __AARCH64_LANE_CHECK (__a, __c);
++#ifdef __AARCH64EB__
++ return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c});
++#else
++ return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1});
++#endif
+ }
+
+-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+-vcltzd_s64 (int64_t __a)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vextq_s8 (int8x16_t __a, int8x16_t __b, __const int __c)
{
-- return __builtin_aarch64_clzv4hi (__a);
+- return __a < 0 ? -1ll : 0ll;
+ __AARCH64_LANE_CHECK (__a, __c);
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__b, __a, (uint8x16_t)
@@ -26124,13 +27695,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+#endif
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vclz_s32 (int32x2_t __a)
+-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+-vcltzd_f64 (float64_t __a)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vextq_s16 (int16x8_t __a, int16x8_t __b, __const int __c)
{
-- return __builtin_aarch64_clzv2si (__a);
+- return __a < 0.0 ? -1ll : 0ll;
+ __AARCH64_LANE_CHECK (__a, __c);
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__b, __a, (uint16x8_t)
@@ -26141,13 +27712,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+#endif
}
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vclz_u8 (uint8x8_t __a)
+-/* vcls. */
+-
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vcls_s8 (int8x8_t __a)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vextq_s32 (int32x4_t __a, int32x4_t __b, __const int __c)
{
-- return (uint8x8_t)__builtin_aarch64_clzv8qi ((int8x8_t)__a);
+- return __builtin_aarch64_clrsbv8qi (__a);
+ __AARCH64_LANE_CHECK (__a, __c);
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__b, __a,
@@ -26157,13 +27730,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+#endif
}
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vclz_u16 (uint16x4_t __a)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vcls_s16 (int16x4_t __a)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vextq_s64 (int64x2_t __a, int64x2_t __b, __const int __c)
{
-- return (uint16x4_t)__builtin_aarch64_clzv4hi ((int16x4_t)__a);
+- return __builtin_aarch64_clrsbv4hi (__a);
+ __AARCH64_LANE_CHECK (__a, __c);
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c});
@@ -26172,13 +27745,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+#endif
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vclz_u32 (uint32x2_t __a)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vcls_s32 (int32x2_t __a)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vextq_u8 (uint8x16_t __a, uint8x16_t __b, __const int __c)
{
-- return (uint32x2_t)__builtin_aarch64_clzv2si ((int32x2_t)__a);
+- return __builtin_aarch64_clrsbv2si (__a);
+ __AARCH64_LANE_CHECK (__a, __c);
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__b, __a, (uint8x16_t)
@@ -26192,12 +27765,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vclzq_s8 (int8x16_t __a)
+-vclsq_s8 (int8x16_t __a)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vextq_u16 (uint16x8_t __a, uint16x8_t __b, __const int __c)
{
-- return __builtin_aarch64_clzv16qi (__a);
+- return __builtin_aarch64_clrsbv16qi (__a);
+ __AARCH64_LANE_CHECK (__a, __c);
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__b, __a, (uint16x8_t)
@@ -26209,12 +27782,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vclzq_s16 (int16x8_t __a)
+-vclsq_s16 (int16x8_t __a)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vextq_u32 (uint32x4_t __a, uint32x4_t __b, __const int __c)
{
-- return __builtin_aarch64_clzv8hi (__a);
+- return __builtin_aarch64_clrsbv8hi (__a);
+ __AARCH64_LANE_CHECK (__a, __c);
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__b, __a,
@@ -26225,12 +27798,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vclzq_s32 (int32x4_t __a)
+-vclsq_s32 (int32x4_t __a)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vextq_u64 (uint64x2_t __a, uint64x2_t __b, __const int __c)
{
-- return __builtin_aarch64_clzv4si (__a);
+- return __builtin_aarch64_clrsbv4si (__a);
+ __AARCH64_LANE_CHECK (__a, __c);
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c});
@@ -26239,487 +27812,455 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+#endif
}
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vclzq_u8 (uint8x16_t __a)
--{
-- return (uint8x16_t)__builtin_aarch64_clzv16qi ((int8x16_t)__a);
--}
+-/* vclz. */
+/* vfma */
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vclzq_u16 (uint16x8_t __a)
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vclz_s8 (int8x8_t __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfma_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
{
-- return (uint16x8_t)__builtin_aarch64_clzv8hi ((int16x8_t)__a);
+- return __builtin_aarch64_clzv8qi (__a);
+ return (float64x1_t) {__builtin_fma (__b[0], __c[0], __a[0])};
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vclzq_u32 (uint32x4_t __a)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vclz_s16 (int16x4_t __a)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfma_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
{
-- return (uint32x4_t)__builtin_aarch64_clzv4si ((int32x4_t)__a);
+- return __builtin_aarch64_clzv4hi (__a);
+ return __builtin_aarch64_fmav2sf (__b, __c, __a);
}
--/* vcnt. */
--
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vcnt_p8 (poly8x8_t __a)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vclz_s32 (int32x2_t __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
{
-- return (poly8x8_t) __builtin_aarch64_popcountv8qi ((int8x8_t) __a);
+- return __builtin_aarch64_clzv2si (__a);
+ return __builtin_aarch64_fmav4sf (__b, __c, __a);
}
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vcnt_s8 (int8x8_t __a)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vclz_u8 (uint8x8_t __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmaq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
{
-- return __builtin_aarch64_popcountv8qi (__a);
+- return (uint8x8_t)__builtin_aarch64_clzv8qi ((int8x8_t)__a);
+ return __builtin_aarch64_fmav2df (__b, __c, __a);
}
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vcnt_u8 (uint8x8_t __a)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vclz_u16 (uint16x4_t __a)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
{
-- return (uint8x8_t) __builtin_aarch64_popcountv8qi ((int8x8_t) __a);
+- return (uint16x4_t)__builtin_aarch64_clzv4hi ((int16x4_t)__a);
+ return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a);
}
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vcntq_p8 (poly8x16_t __a)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vclz_u32 (uint32x2_t __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfma_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c)
{
-- return (poly8x16_t) __builtin_aarch64_popcountv16qi ((int8x16_t) __a);
+- return (uint32x2_t)__builtin_aarch64_clzv2si ((int32x2_t)__a);
+ return (float64x1_t) {__b[0] * __c + __a[0]};
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vcntq_s8 (int8x16_t __a)
+-vclzq_s8 (int8x16_t __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
{
-- return __builtin_aarch64_popcountv16qi (__a);
+- return __builtin_aarch64_clzv16qi (__a);
+ return __builtin_aarch64_fmav4sf (__b, vdupq_n_f32 (__c), __a);
}
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vcntq_u8 (uint8x16_t __a)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vclzq_s16 (int16x8_t __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmaq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c)
{
-- return (uint8x16_t) __builtin_aarch64_popcountv16qi ((int8x16_t) __a);
+- return __builtin_aarch64_clzv8hi (__a);
+ return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c), __a);
}
--/* vcvt (double -> float). */
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vclzq_s32 (int32x4_t __a)
+/* vfma_lane */
-
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
--vcvt_f16_f32 (float32x4_t __a)
++
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfma_lane_f32 (float32x2_t __a, float32x2_t __b,
+ float32x2_t __c, const int __lane)
{
-- return __builtin_aarch64_float_truncate_lo_v4hf (__a);
+- return __builtin_aarch64_clzv4si (__a);
+ return __builtin_aarch64_fmav2sf (__b,
+ __aarch64_vdup_lane_f32 (__c, __lane),
+ __a);
}
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
--vcvt_high_f16_f32 (float16x4_t __a, float32x4_t __b)
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vclzq_u8 (uint8x16_t __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfma_lane_f64 (float64x1_t __a, float64x1_t __b,
+ float64x1_t __c, const int __lane)
{
-- return __builtin_aarch64_float_truncate_hi_v8hf (__a, __b);
+- return (uint8x16_t)__builtin_aarch64_clzv16qi ((int8x16_t)__a);
+ return (float64x1_t) {__builtin_fma (__b[0], __c[0], __a[0])};
}
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vcvt_f32_f64 (float64x2_t __a)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vclzq_u16 (uint16x8_t __a)
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmad_lane_f64 (float64_t __a, float64_t __b,
+ float64x1_t __c, const int __lane)
{
-- return __builtin_aarch64_float_truncate_lo_v2sf (__a);
+- return (uint16x8_t)__builtin_aarch64_clzv8hi ((int16x8_t)__a);
+ return __builtin_fma (__b, __c[0], __a);
}
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vcvt_high_f32_f64 (float32x2_t __a, float64x2_t __b)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vclzq_u32 (uint32x4_t __a)
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmas_lane_f32 (float32_t __a, float32_t __b,
+ float32x2_t __c, const int __lane)
{
-- return __builtin_aarch64_float_truncate_hi_v4sf (__a, __b);
+- return (uint32x4_t)__builtin_aarch64_clzv4si ((int32x4_t)__a);
+ return __builtin_fmaf (__b, __aarch64_vget_lane_any (__c, __lane), __a);
}
--/* vcvt (float -> double). */
+-/* vcnt. */
+/* vfma_laneq */
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vcvt_f32_f16 (float16x4_t __a)
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+-vcnt_p8 (poly8x8_t __a)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfma_laneq_f32 (float32x2_t __a, float32x2_t __b,
+ float32x4_t __c, const int __lane)
{
-- return __builtin_aarch64_float_extend_lo_v4sf (__a);
+- return (poly8x8_t) __builtin_aarch64_popcountv8qi ((int8x8_t) __a);
+ return __builtin_aarch64_fmav2sf (__b,
+ __aarch64_vdup_laneq_f32 (__c, __lane),
+ __a);
}
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vcvt_f64_f32 (float32x2_t __a)
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vcnt_s8 (int8x8_t __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfma_laneq_f64 (float64x1_t __a, float64x1_t __b,
+ float64x2_t __c, const int __lane)
{
--
-- return __builtin_aarch64_float_extend_lo_v2df (__a);
+- return __builtin_aarch64_popcountv8qi (__a);
+ float64_t __c0 = __aarch64_vget_lane_any (__c, __lane);
+ return (float64x1_t) {__builtin_fma (__b[0], __c0, __a[0])};
}
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vcvt_high_f32_f16 (float16x8_t __a)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vcnt_u8 (uint8x8_t __a)
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmad_laneq_f64 (float64_t __a, float64_t __b,
+ float64x2_t __c, const int __lane)
{
-- return __builtin_aarch64_vec_unpacks_hi_v8hf (__a);
+- return (uint8x8_t) __builtin_aarch64_popcountv8qi ((int8x8_t) __a);
+ return __builtin_fma (__b, __aarch64_vget_lane_any (__c, __lane), __a);
}
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vcvt_high_f64_f32 (float32x4_t __a)
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+-vcntq_p8 (poly8x16_t __a)
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmas_laneq_f32 (float32_t __a, float32_t __b,
+ float32x4_t __c, const int __lane)
{
-- return __builtin_aarch64_vec_unpacks_hi_v4sf (__a);
+- return (poly8x16_t) __builtin_aarch64_popcountv16qi ((int8x16_t) __a);
+ return __builtin_fmaf (__b, __aarch64_vget_lane_any (__c, __lane), __a);
}
--/* vcvt (<u>int -> float) */
--
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vcvtd_f64_s64 (int64_t __a)
--{
-- return (float64_t) __a;
--}
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+-vcntq_s8 (int8x16_t __a)
+/* vfmaq_lane */
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vcvtd_f64_u64 (uint64_t __a)
++
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmaq_lane_f32 (float32x4_t __a, float32x4_t __b,
+ float32x2_t __c, const int __lane)
{
-- return (float64_t) __a;
+- return __builtin_aarch64_popcountv16qi (__a);
+ return __builtin_aarch64_fmav4sf (__b,
+ __aarch64_vdupq_lane_f32 (__c, __lane),
+ __a);
}
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vcvts_f32_s32 (int32_t __a)
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vcntq_u8 (uint8x16_t __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmaq_lane_f64 (float64x2_t __a, float64x2_t __b,
+ float64x1_t __c, const int __lane)
{
-- return (float32_t) __a;
+- return (uint8x16_t) __builtin_aarch64_popcountv16qi ((int8x16_t) __a);
+ return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c[0]), __a);
}
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vcvts_f32_u32 (uint32_t __a)
--{
-- return (float32_t) __a;
--}
+-/* vcvt (double -> float). */
+/* vfmaq_laneq */
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vcvt_f32_s32 (int32x2_t __a)
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+-vcvt_f16_f32 (float32x4_t __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmaq_laneq_f32 (float32x4_t __a, float32x4_t __b,
+ float32x4_t __c, const int __lane)
{
-- return __builtin_aarch64_floatv2siv2sf (__a);
+- return __builtin_aarch64_float_truncate_lo_v4hf (__a);
+ return __builtin_aarch64_fmav4sf (__b,
+ __aarch64_vdupq_laneq_f32 (__c, __lane),
+ __a);
}
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vcvt_f32_u32 (uint32x2_t __a)
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+-vcvt_high_f16_f32 (float16x4_t __a, float32x4_t __b)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmaq_laneq_f64 (float64x2_t __a, float64x2_t __b,
+ float64x2_t __c, const int __lane)
{
-- return __builtin_aarch64_floatunsv2siv2sf ((int32x2_t) __a);
+- return __builtin_aarch64_float_truncate_hi_v8hf (__a, __b);
+ return __builtin_aarch64_fmav2df (__b,
+ __aarch64_vdupq_laneq_f64 (__c, __lane),
+ __a);
}
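
[Note: the vfma*_lane/vfma*_laneq intrinsics added above follow the usual ACLE semantics: each result lane is a[i] + b[i] * c[lane], a fused multiply-add with the third operand broadcast from the selected lane. An illustrative use, assuming an AArch64 target; fma_by_lane is a made-up name:

    #include <arm_neon.h>

    /* acc[i] + x[i] * coeffs[2], one rounding step per lane.  */
    float32x4_t
    fma_by_lane (float32x4_t acc, float32x4_t x, float32x4_t coeffs)
    {
      return vfmaq_laneq_f32 (acc, x, coeffs, 2);
    }
]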
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vcvtq_f32_s32 (int32x4_t __a)
--{
-- return __builtin_aarch64_floatv4siv4sf (__a);
--}
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vcvt_f32_f64 (float64x2_t __a)
+/* vfms */
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vcvtq_f32_u32 (uint32x4_t __a)
++
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
{
-- return __builtin_aarch64_floatunsv4siv4sf ((int32x4_t) __a);
+- return __builtin_aarch64_float_truncate_lo_v2sf (__a);
+ return (float64x1_t) {__builtin_fma (-__b[0], __c[0], __a[0])};
}
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vcvtq_f64_s64 (int64x2_t __a)
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vcvt_high_f32_f64 (float32x2_t __a, float64x2_t __b)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
{
-- return __builtin_aarch64_floatv2div2df (__a);
+- return __builtin_aarch64_float_truncate_hi_v4sf (__a, __b);
+ return __builtin_aarch64_fmav2sf (-__b, __c, __a);
}
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vcvtq_f64_u64 (uint64x2_t __a)
+-/* vcvt (float -> double). */
+-
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vcvt_f32_f16 (float16x4_t __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
{
-- return __builtin_aarch64_floatunsv2div2df ((int64x2_t) __a);
+- return __builtin_aarch64_float_extend_lo_v4sf (__a);
+ return __builtin_aarch64_fmav4sf (-__b, __c, __a);
}
--/* vcvt (float -> <u>int) */
--
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vcvtd_s64_f64 (float64_t __a)
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vcvt_f64_f32 (float32x2_t __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
{
-- return (int64_t) __a;
+-
+- return __builtin_aarch64_float_extend_lo_v2df (__a);
+ return __builtin_aarch64_fmav2df (-__b, __c, __a);
}
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcvtd_u64_f64 (float64_t __a)
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vcvt_high_f32_f16 (float16x8_t __a)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
{
-- return (uint64_t) __a;
+- return __builtin_aarch64_vec_unpacks_hi_v8hf (__a);
+ return __builtin_aarch64_fmav2sf (-__b, vdup_n_f32 (__c), __a);
}
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vcvts_s32_f32 (float32_t __a)
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vcvt_high_f64_f32 (float32x4_t __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c)
{
-- return (int32_t) __a;
+- return __builtin_aarch64_vec_unpacks_hi_v4sf (__a);
+ return (float64x1_t) {-__b[0] * __c + __a[0]};
}
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vcvts_u32_f32 (float32_t __a)
+-/* vcvt (<u>int -> float) */
+-
+-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+-vcvtd_f64_s64 (int64_t __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
{
-- return (uint32_t) __a;
+- return (float64_t) __a;
+ return __builtin_aarch64_fmav4sf (-__b, vdupq_n_f32 (__c), __a);
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vcvt_s32_f32 (float32x2_t __a)
+-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+-vcvtd_f64_u64 (uint64_t __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c)
{
-- return __builtin_aarch64_lbtruncv2sfv2si (__a);
+- return (float64_t) __a;
+ return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c), __a);
}
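
[Note: as the new bodies above make explicit, the vfms* family is fused multiply-subtract computed as fma(-b, c, a), i.e. a - b * c with a single rounding. A short usage sketch under the same AArch64 assumption; residual is a made-up name:

    #include <arm_neon.h>

    /* acc - x * y per lane, fused (no intermediate rounding).  */
    float32x2_t
    residual (float32x2_t acc, float32x2_t x, float32x2_t y)
    {
      return vfms_f32 (acc, x, y);
    }
]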
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcvt_u32_f32 (float32x2_t __a)
--{
-- return __builtin_aarch64_lbtruncuv2sfv2si_us (__a);
--}
+-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+-vcvts_f32_s32 (int32_t __a)
+/* vfms_lane */
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vcvtq_s32_f32 (float32x4_t __a)
++
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_lane_f32 (float32x2_t __a, float32x2_t __b,
+ float32x2_t __c, const int __lane)
{
-- return __builtin_aarch64_lbtruncv4sfv4si (__a);
+- return (float32_t) __a;
+ return __builtin_aarch64_fmav2sf (-__b,
+ __aarch64_vdup_lane_f32 (__c, __lane),
+ __a);
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcvtq_u32_f32 (float32x4_t __a)
+-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+-vcvts_f32_u32 (uint32_t __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_lane_f64 (float64x1_t __a, float64x1_t __b,
+ float64x1_t __c, const int __lane)
{
-- return __builtin_aarch64_lbtruncuv4sfv4si_us (__a);
+- return (float32_t) __a;
+ return (float64x1_t) {__builtin_fma (-__b[0], __c[0], __a[0])};
}
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vcvt_s64_f64 (float64x1_t __a)
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vcvt_f32_s32 (int32x2_t __a)
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsd_lane_f64 (float64_t __a, float64_t __b,
+ float64x1_t __c, const int __lane)
{
-- return (int64x1_t) {vcvtd_s64_f64 (__a[0])};
+- return __builtin_aarch64_floatv2siv2sf (__a);
+ return __builtin_fma (-__b, __c[0], __a);
}
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcvt_u64_f64 (float64x1_t __a)
-+__extension__ extern __inline float32_t
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vcvt_f32_u32 (uint32x2_t __a)
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmss_lane_f32 (float32_t __a, float32_t __b,
+ float32x2_t __c, const int __lane)
{
-- return (uint64x1_t) {vcvtd_u64_f64 (__a[0])};
+- return __builtin_aarch64_floatunsv2siv2sf ((int32x2_t) __a);
+ return __builtin_fmaf (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vcvtq_s64_f64 (float64x2_t __a)
--{
-- return __builtin_aarch64_lbtruncv2dfv2di (__a);
--}
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vcvtq_f32_s32 (int32x4_t __a)
+/* vfms_laneq */
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcvtq_u64_f64 (float64x2_t __a)
++
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_laneq_f32 (float32x2_t __a, float32x2_t __b,
+ float32x4_t __c, const int __lane)
{
-- return __builtin_aarch64_lbtruncuv2dfv2di_us (__a);
+- return __builtin_aarch64_floatv4siv4sf (__a);
+ return __builtin_aarch64_fmav2sf (-__b,
+ __aarch64_vdup_laneq_f32 (__c, __lane),
+ __a);
}
--/* vcvta */
--
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vcvtad_s64_f64 (float64_t __a)
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vcvtq_f32_u32 (uint32x4_t __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_laneq_f64 (float64x1_t __a, float64x1_t __b,
+ float64x2_t __c, const int __lane)
{
-- return __builtin_aarch64_lrounddfdi (__a);
+- return __builtin_aarch64_floatunsv4siv4sf ((int32x4_t) __a);
+ float64_t __c0 = __aarch64_vget_lane_any (__c, __lane);
+ return (float64x1_t) {__builtin_fma (-__b[0], __c0, __a[0])};
}
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcvtad_u64_f64 (float64_t __a)
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vcvtq_f64_s64 (int64x2_t __a)
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsd_laneq_f64 (float64_t __a, float64_t __b,
+ float64x2_t __c, const int __lane)
{
-- return __builtin_aarch64_lroundudfdi_us (__a);
+- return __builtin_aarch64_floatv2div2df (__a);
+ return __builtin_fma (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
}
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vcvtas_s32_f32 (float32_t __a)
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vcvtq_f64_u64 (uint64x2_t __a)
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmss_laneq_f32 (float32_t __a, float32_t __b,
+ float32x4_t __c, const int __lane)
{
-- return __builtin_aarch64_lroundsfsi (__a);
+- return __builtin_aarch64_floatunsv2div2df ((int64x2_t) __a);
+ return __builtin_fmaf (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
}
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vcvtas_u32_f32 (float32_t __a)
+-/* vcvt (float -> <u>int) */
+/* vfmsq_lane */
-+
+
+-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+-vcvtd_s64_f64 (float64_t __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsq_lane_f32 (float32x4_t __a, float32x4_t __b,
+ float32x2_t __c, const int __lane)
{
-- return __builtin_aarch64_lroundusfsi_us (__a);
+- return (int64_t) __a;
+ return __builtin_aarch64_fmav4sf (-__b,
+ __aarch64_vdupq_lane_f32 (__c, __lane),
+ __a);
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vcvta_s32_f32 (float32x2_t __a)
+-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+-vcvtd_u64_f64 (float64_t __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsq_lane_f64 (float64x2_t __a, float64x2_t __b,
+ float64x1_t __c, const int __lane)
{
-- return __builtin_aarch64_lroundv2sfv2si (__a);
+- return (uint64_t) __a;
+ return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c[0]), __a);
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcvta_u32_f32 (float32x2_t __a)
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vcvts_s32_f32 (float32_t __a)
+/* vfmsq_laneq */
+
+__extension__ extern __inline float32x4_t
@@ -26727,853 +28268,914 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+vfmsq_laneq_f32 (float32x4_t __a, float32x4_t __b,
+ float32x4_t __c, const int __lane)
{
-- return __builtin_aarch64_lrounduv2sfv2si_us (__a);
+- return (int32_t) __a;
+ return __builtin_aarch64_fmav4sf (-__b,
+ __aarch64_vdupq_laneq_f32 (__c, __lane),
+ __a);
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vcvtaq_s32_f32 (float32x4_t __a)
+-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+-vcvts_u32_f32 (float32_t __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsq_laneq_f64 (float64x2_t __a, float64x2_t __b,
+ float64x2_t __c, const int __lane)
{
-- return __builtin_aarch64_lroundv4sfv4si (__a);
+- return (uint32_t) __a;
+ return __builtin_aarch64_fmav2df (-__b,
+ __aarch64_vdupq_laneq_f64 (__c, __lane),
+ __a);
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcvtaq_u32_f32 (float32x4_t __a)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vcvt_s32_f32 (float32x2_t __a)
+/* vld1 */
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f16 (const float16_t *__a)
{
-- return __builtin_aarch64_lrounduv4sfv4si_us (__a);
+- return __builtin_aarch64_lbtruncv2sfv2si (__a);
+ return __builtin_aarch64_ld1v4hf (__a);
}
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vcvta_s64_f64 (float64x1_t __a)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vcvt_u32_f32 (float32x2_t __a)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f32 (const float32_t *a)
{
-- return (int64x1_t) {vcvtad_s64_f64 (__a[0])};
+- return __builtin_aarch64_lbtruncuv2sfv2si_us (__a);
+ return __builtin_aarch64_ld1v2sf ((const __builtin_aarch64_simd_sf *) a);
}
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcvta_u64_f64 (float64x1_t __a)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vcvtq_s32_f32 (float32x4_t __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f64 (const float64_t *a)
{
-- return (uint64x1_t) {vcvtad_u64_f64 (__a[0])};
+- return __builtin_aarch64_lbtruncv4sfv4si (__a);
+ return (float64x1_t) {*a};
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vcvtaq_s64_f64 (float64x2_t __a)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vcvtq_u32_f32 (float32x4_t __a)
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p8 (const poly8_t *a)
{
-- return __builtin_aarch64_lroundv2dfv2di (__a);
+- return __builtin_aarch64_lbtruncuv4sfv4si_us (__a);
+ return (poly8x8_t)
+ __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcvtaq_u64_f64 (float64x2_t __a)
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+-vcvt_s64_f64 (float64x1_t __a)
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p16 (const poly16_t *a)
{
-- return __builtin_aarch64_lrounduv2dfv2di_us (__a);
+- return (int64x1_t) {vcvtd_s64_f64 (__a[0])};
+ return (poly16x4_t)
+ __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
}
--/* vcvtm */
--
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vcvtmd_s64_f64 (float64_t __a)
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+-vcvt_u64_f64 (float64x1_t __a)
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vld1_p64 (const poly64_t *a)
+ {
+- return (uint64x1_t) {vcvtd_u64_f64 (__a[0])};
++ return (poly64x1_t) {*a};
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vcvtq_s64_f64 (float64x2_t __a)
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s8 (const int8_t *a)
{
-- return __builtin_llfloor (__a);
+- return __builtin_aarch64_lbtruncv2dfv2di (__a);
+ return __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
}
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcvtmd_u64_f64 (float64_t __a)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vcvtq_u64_f64 (float64x2_t __a)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s16 (const int16_t *a)
{
-- return __builtin_aarch64_lfloorudfdi_us (__a);
+- return __builtin_aarch64_lbtruncuv2dfv2di_us (__a);
+ return __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
}
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vcvtms_s32_f32 (float32_t __a)
+-/* vcvta */
+-
+-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+-vcvtad_s64_f64 (float64_t __a)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s32 (const int32_t *a)
{
-- return __builtin_ifloorf (__a);
+- return __builtin_aarch64_lrounddfdi (__a);
+ return __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a);
}
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vcvtms_u32_f32 (float32_t __a)
+-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+-vcvtad_u64_f64 (float64_t __a)
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s64 (const int64_t *a)
{
-- return __builtin_aarch64_lfloorusfsi_us (__a);
+- return __builtin_aarch64_lroundudfdi_us (__a);
+ return (int64x1_t) {*a};
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vcvtm_s32_f32 (float32x2_t __a)
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vcvtas_s32_f32 (float32_t __a)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u8 (const uint8_t *a)
{
-- return __builtin_aarch64_lfloorv2sfv2si (__a);
+- return __builtin_aarch64_lroundsfsi (__a);
+ return (uint8x8_t)
+ __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcvtm_u32_f32 (float32x2_t __a)
+-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+-vcvtas_u32_f32 (float32_t __a)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u16 (const uint16_t *a)
{
-- return __builtin_aarch64_lflooruv2sfv2si_us (__a);
+- return __builtin_aarch64_lroundusfsi_us (__a);
+ return (uint16x4_t)
+ __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vcvtmq_s32_f32 (float32x4_t __a)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vcvta_s32_f32 (float32x2_t __a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u32 (const uint32_t *a)
{
-- return __builtin_aarch64_lfloorv4sfv4si (__a);
+- return __builtin_aarch64_lroundv2sfv2si (__a);
+ return (uint32x2_t)
+ __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a);
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcvtmq_u32_f32 (float32x4_t __a)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vcvta_u32_f32 (float32x2_t __a)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u64 (const uint64_t *a)
{
-- return __builtin_aarch64_lflooruv4sfv4si_us (__a);
+- return __builtin_aarch64_lrounduv2sfv2si_us (__a);
+ return (uint64x1_t) {*a};
}
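
[Note: the vld1_* intrinsics rewritten above each load one 64-bit vector from a pointer with no alignment requirement beyond the element type; the vld1q_* block that follows is the 128-bit counterpart. For example — load_pair is a made-up name:

    #include <arm_neon.h>

    /* Returns {p[0], p[1]} loaded with a single LD1.  */
    int32x2_t
    load_pair (const int32_t *p)
    {
      return vld1_s32 (p);
    }
]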
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vcvtm_s64_f64 (float64x1_t __a)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vcvtaq_s32_f32 (float32x4_t __a)
+/* vld1q */
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f16 (const float16_t *__a)
{
-- return (int64x1_t) {vcvtmd_s64_f64 (__a[0])};
+- return __builtin_aarch64_lroundv4sfv4si (__a);
+ return __builtin_aarch64_ld1v8hf (__a);
}
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcvtm_u64_f64 (float64x1_t __a)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vcvtaq_u32_f32 (float32x4_t __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f32 (const float32_t *a)
{
-- return (uint64x1_t) {vcvtmd_u64_f64 (__a[0])};
+- return __builtin_aarch64_lrounduv4sfv4si_us (__a);
+ return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) a);
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vcvtmq_s64_f64 (float64x2_t __a)
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+-vcvta_s64_f64 (float64x1_t __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f64 (const float64_t *a)
{
-- return __builtin_aarch64_lfloorv2dfv2di (__a);
+- return (int64x1_t) {vcvtad_s64_f64 (__a[0])};
+ return __builtin_aarch64_ld1v2df ((const __builtin_aarch64_simd_df *) a);
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcvtmq_u64_f64 (float64x2_t __a)
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+-vcvta_u64_f64 (float64x1_t __a)
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p8 (const poly8_t *a)
{
-- return __builtin_aarch64_lflooruv2dfv2di_us (__a);
+- return (uint64x1_t) {vcvtad_u64_f64 (__a[0])};
+ return (poly8x16_t)
+ __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
}
--/* vcvtn */
--
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vcvtnd_s64_f64 (float64_t __a)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vcvtaq_s64_f64 (float64x2_t __a)
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p16 (const poly16_t *a)
{
-- return __builtin_aarch64_lfrintndfdi (__a);
+- return __builtin_aarch64_lroundv2dfv2di (__a);
+ return (poly16x8_t)
+ __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
}
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcvtnd_u64_f64 (float64_t __a)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vcvtaq_u64_f64 (float64x2_t __a)
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vld1q_p64 (const poly64_t *a)
+ {
+- return __builtin_aarch64_lrounduv2dfv2di_us (__a);
++ return (poly64x2_t)
++ __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
+ }
+
+-/* vcvtm */
+-
+-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+-vcvtmd_s64_f64 (float64_t __a)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s8 (const int8_t *a)
{
-- return __builtin_aarch64_lfrintnudfdi_us (__a);
+- return __builtin_llfloor (__a);
+ return __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
}
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vcvtns_s32_f32 (float32_t __a)
+-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+-vcvtmd_u64_f64 (float64_t __a)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s16 (const int16_t *a)
{
-- return __builtin_aarch64_lfrintnsfsi (__a);
+- return __builtin_aarch64_lfloorudfdi_us (__a);
+ return __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
}
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vcvtns_u32_f32 (float32_t __a)
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vcvtms_s32_f32 (float32_t __a)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s32 (const int32_t *a)
{
-- return __builtin_aarch64_lfrintnusfsi_us (__a);
+- return __builtin_ifloorf (__a);
+ return __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a);
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vcvtn_s32_f32 (float32x2_t __a)
+-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+-vcvtms_u32_f32 (float32_t __a)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s64 (const int64_t *a)
{
-- return __builtin_aarch64_lfrintnv2sfv2si (__a);
+- return __builtin_aarch64_lfloorusfsi_us (__a);
+ return __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcvtn_u32_f32 (float32x2_t __a)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vcvtm_s32_f32 (float32x2_t __a)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u8 (const uint8_t *a)
{
-- return __builtin_aarch64_lfrintnuv2sfv2si_us (__a);
+- return __builtin_aarch64_lfloorv2sfv2si (__a);
+ return (uint8x16_t)
+ __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vcvtnq_s32_f32 (float32x4_t __a)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vcvtm_u32_f32 (float32x2_t __a)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u16 (const uint16_t *a)
{
-- return __builtin_aarch64_lfrintnv4sfv4si (__a);
+- return __builtin_aarch64_lflooruv2sfv2si_us (__a);
+ return (uint16x8_t)
+ __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcvtnq_u32_f32 (float32x4_t __a)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vcvtmq_s32_f32 (float32x4_t __a)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u32 (const uint32_t *a)
{
-- return __builtin_aarch64_lfrintnuv4sfv4si_us (__a);
+- return __builtin_aarch64_lfloorv4sfv4si (__a);
+ return (uint32x4_t)
+ __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a);
}
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vcvtn_s64_f64 (float64x1_t __a)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vcvtmq_u32_f32 (float32x4_t __a)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u64 (const uint64_t *a)
{
-- return (int64x1_t) {vcvtnd_s64_f64 (__a[0])};
+- return __builtin_aarch64_lflooruv4sfv4si_us (__a);
+ return (uint64x2_t)
+ __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
}
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcvtn_u64_f64 (float64x1_t __a)
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+-vcvtm_s64_f64 (float64x1_t __a)
+/* vld1_dup */
+
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_f16 (const float16_t* __a)
{
-- return (uint64x1_t) {vcvtnd_u64_f64 (__a[0])};
+- return (int64x1_t) {vcvtmd_s64_f64 (__a[0])};
+ return vdup_n_f16 (*__a);
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vcvtnq_s64_f64 (float64x2_t __a)
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+-vcvtm_u64_f64 (float64x1_t __a)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_f32 (const float32_t* __a)
{
-- return __builtin_aarch64_lfrintnv2dfv2di (__a);
+- return (uint64x1_t) {vcvtmd_u64_f64 (__a[0])};
+ return vdup_n_f32 (*__a);
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcvtnq_u64_f64 (float64x2_t __a)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vcvtmq_s64_f64 (float64x2_t __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_f64 (const float64_t* __a)
{
-- return __builtin_aarch64_lfrintnuv2dfv2di_us (__a);
+- return __builtin_aarch64_lfloorv2dfv2di (__a);
+ return vdup_n_f64 (*__a);
}
--/* vcvtp */
--
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vcvtpd_s64_f64 (float64_t __a)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vcvtmq_u64_f64 (float64x2_t __a)
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_p8 (const poly8_t* __a)
{
-- return __builtin_llceil (__a);
+- return __builtin_aarch64_lflooruv2dfv2di_us (__a);
+ return vdup_n_p8 (*__a);
}
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcvtpd_u64_f64 (float64_t __a)
+-/* vcvtn */
+-
+-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+-vcvtnd_s64_f64 (float64_t __a)
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_p16 (const poly16_t* __a)
{
-- return __builtin_aarch64_lceiludfdi_us (__a);
+- return __builtin_aarch64_lfrintndfdi (__a);
+ return vdup_n_p16 (*__a);
}
+-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+-vcvtnd_u64_f64 (float64_t __a)
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vld1_dup_p64 (const poly64_t* __a)
+ {
+- return __builtin_aarch64_lfrintnudfdi_us (__a);
++ return vdup_n_p64 (*__a);
+ }
+
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vcvtps_s32_f32 (float32_t __a)
+-vcvtns_s32_f32 (float32_t __a)
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_s8 (const int8_t* __a)
{
-- return __builtin_iceilf (__a);
+- return __builtin_aarch64_lfrintnsfsi (__a);
+ return vdup_n_s8 (*__a);
}
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vcvtps_u32_f32 (float32_t __a)
+-vcvtns_u32_f32 (float32_t __a)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_s16 (const int16_t* __a)
{
-- return __builtin_aarch64_lceilusfsi_us (__a);
+- return __builtin_aarch64_lfrintnusfsi_us (__a);
+ return vdup_n_s16 (*__a);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vcvtp_s32_f32 (float32x2_t __a)
+-vcvtn_s32_f32 (float32x2_t __a)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_s32 (const int32_t* __a)
{
-- return __builtin_aarch64_lceilv2sfv2si (__a);
+- return __builtin_aarch64_lfrintnv2sfv2si (__a);
+ return vdup_n_s32 (*__a);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcvtp_u32_f32 (float32x2_t __a)
+-vcvtn_u32_f32 (float32x2_t __a)
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_s64 (const int64_t* __a)
{
-- return __builtin_aarch64_lceiluv2sfv2si_us (__a);
+- return __builtin_aarch64_lfrintnuv2sfv2si_us (__a);
+ return vdup_n_s64 (*__a);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vcvtpq_s32_f32 (float32x4_t __a)
+-vcvtnq_s32_f32 (float32x4_t __a)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_u8 (const uint8_t* __a)
{
-- return __builtin_aarch64_lceilv4sfv4si (__a);
+- return __builtin_aarch64_lfrintnv4sfv4si (__a);
+ return vdup_n_u8 (*__a);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcvtpq_u32_f32 (float32x4_t __a)
+-vcvtnq_u32_f32 (float32x4_t __a)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_u16 (const uint16_t* __a)
{
-- return __builtin_aarch64_lceiluv4sfv4si_us (__a);
+- return __builtin_aarch64_lfrintnuv4sfv4si_us (__a);
+ return vdup_n_u16 (*__a);
}
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vcvtp_s64_f64 (float64x1_t __a)
+-vcvtn_s64_f64 (float64x1_t __a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_u32 (const uint32_t* __a)
{
-- return (int64x1_t) {vcvtpd_s64_f64 (__a[0])};
+- return (int64x1_t) {vcvtnd_s64_f64 (__a[0])};
+ return vdup_n_u32 (*__a);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcvtp_u64_f64 (float64x1_t __a)
+-vcvtn_u64_f64 (float64x1_t __a)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_u64 (const uint64_t* __a)
{
-- return (uint64x1_t) {vcvtpd_u64_f64 (__a[0])};
+- return (uint64x1_t) {vcvtnd_u64_f64 (__a[0])};
+ return vdup_n_u64 (*__a);
}
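
[Note: vld1_dup_* loads a single scalar and broadcasts it to every lane — as the new bodies show, it is simply vdup_n_* applied to the loaded value. Illustrative use; splat_from_memory is a made-up name:

    #include <arm_neon.h>

    /* Returns {*p, *p, *p, *p}.  */
    uint16x4_t
    splat_from_memory (const uint16_t *p)
    {
      return vld1_dup_u16 (p);
    }
]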
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vcvtpq_s64_f64 (float64x2_t __a)
+-vcvtnq_s64_f64 (float64x2_t __a)
+/* vld1q_dup */
+
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_f16 (const float16_t* __a)
{
-- return __builtin_aarch64_lceilv2dfv2di (__a);
+- return __builtin_aarch64_lfrintnv2dfv2di (__a);
+ return vdupq_n_f16 (*__a);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcvtpq_u64_f64 (float64x2_t __a)
+-vcvtnq_u64_f64 (float64x2_t __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_f32 (const float32_t* __a)
{
-- return __builtin_aarch64_lceiluv2dfv2di_us (__a);
+- return __builtin_aarch64_lfrintnuv2dfv2di_us (__a);
+ return vdupq_n_f32 (*__a);
}
--/* vdup_n */
+-/* vcvtp */
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vdup_n_f32 (float32_t __a)
+-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+-vcvtpd_s64_f64 (float64_t __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_f64 (const float64_t* __a)
{
-- return (float32x2_t) {__a, __a};
+- return __builtin_llceil (__a);
+ return vdupq_n_f64 (*__a);
}
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vdup_n_f64 (float64_t __a)
+-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+-vcvtpd_u64_f64 (float64_t __a)
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_p8 (const poly8_t* __a)
{
-- return (float64x1_t) {__a};
+- return __builtin_aarch64_lceiludfdi_us (__a);
+ return vdupq_n_p8 (*__a);
}
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vdup_n_p8 (poly8_t __a)
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vcvtps_s32_f32 (float32_t __a)
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_p16 (const poly16_t* __a)
{
-- return (poly8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+- return __builtin_iceilf (__a);
+ return vdupq_n_p16 (*__a);
}
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vdup_n_p16 (poly16_t __a)
-+__extension__ extern __inline int8x16_t
+-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+-vcvtps_u32_f32 (float32_t __a)
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vld1q_dup_p64 (const poly64_t* __a)
+ {
+- return __builtin_aarch64_lceilusfsi_us (__a);
++ return vdupq_n_p64 (*__a);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vcvtp_s32_f32 (float32x2_t __a)
++ __extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_s8 (const int8_t* __a)
{
-- return (poly16x4_t) {__a, __a, __a, __a};
+- return __builtin_aarch64_lceilv2sfv2si (__a);
+ return vdupq_n_s8 (*__a);
}
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vdup_n_s8 (int8_t __a)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vcvtp_u32_f32 (float32x2_t __a)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_s16 (const int16_t* __a)
{
-- return (int8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+- return __builtin_aarch64_lceiluv2sfv2si_us (__a);
+ return vdupq_n_s16 (*__a);
}
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vdup_n_s16 (int16_t __a)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vcvtpq_s32_f32 (float32x4_t __a)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_s32 (const int32_t* __a)
{
-- return (int16x4_t) {__a, __a, __a, __a};
+- return __builtin_aarch64_lceilv4sfv4si (__a);
+ return vdupq_n_s32 (*__a);
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vdup_n_s32 (int32_t __a)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vcvtpq_u32_f32 (float32x4_t __a)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_s64 (const int64_t* __a)
{
-- return (int32x2_t) {__a, __a};
+- return __builtin_aarch64_lceiluv4sfv4si_us (__a);
+ return vdupq_n_s64 (*__a);
}
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vdup_n_s64 (int64_t __a)
+-vcvtp_s64_f64 (float64x1_t __a)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_u8 (const uint8_t* __a)
{
-- return (int64x1_t) {__a};
+- return (int64x1_t) {vcvtpd_s64_f64 (__a[0])};
+ return vdupq_n_u8 (*__a);
}
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vdup_n_u8 (uint8_t __a)
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+-vcvtp_u64_f64 (float64x1_t __a)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_u16 (const uint16_t* __a)
{
-- return (uint8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+- return (uint64x1_t) {vcvtpd_u64_f64 (__a[0])};
+ return vdupq_n_u16 (*__a);
}
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vdup_n_u16 (uint16_t __a)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vcvtpq_s64_f64 (float64x2_t __a)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_u32 (const uint32_t* __a)
{
-- return (uint16x4_t) {__a, __a, __a, __a};
+- return __builtin_aarch64_lceilv2dfv2di (__a);
+ return vdupq_n_u32 (*__a);
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vdup_n_u32 (uint32_t __a)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vcvtpq_u64_f64 (float64x2_t __a)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_u64 (const uint64_t* __a)
{
-- return (uint32x2_t) {__a, __a};
+- return __builtin_aarch64_lceiluv2dfv2di_us (__a);
+ return vdupq_n_u64 (*__a);
}
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vdup_n_u64 (uint64_t __a)
+-/* vdup_n */
+/* vld1_lane */
-+
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vdup_n_f32 (float32_t __a)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_f16 (const float16_t *__src, float16x4_t __vec, const int __lane)
{
-- return (uint64x1_t) {__a};
+- return (float32x2_t) {__a, __a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--/* vdupq_n */
--
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vdupq_n_f32 (float32_t __a)
+-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+-vdup_n_f64 (float64_t __a)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_f32 (const float32_t *__src, float32x2_t __vec, const int __lane)
{
-- return (float32x4_t) {__a, __a, __a, __a};
+- return (float64x1_t) {__a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vdupq_n_f64 (float64_t __a)
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+-vdup_n_p8 (poly8_t __a)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_f64 (const float64_t *__src, float64x1_t __vec, const int __lane)
{
-- return (float64x2_t) {__a, __a};
+- return (poly8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vdupq_n_p8 (uint32_t __a)
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+-vdup_n_p16 (poly16_t __a)
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_p8 (const poly8_t *__src, poly8x8_t __vec, const int __lane)
{
-- return (poly8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
-- __a, __a, __a, __a, __a, __a, __a, __a};
+- return (poly16x4_t) {__a, __a, __a, __a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vdupq_n_p16 (uint32_t __a)
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vdup_n_s8 (int8_t __a)
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_p16 (const poly16_t *__src, poly16x4_t __vec, const int __lane)
{
-- return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+- return (int8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vdupq_n_s8 (int32_t __a)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vdup_n_s16 (int16_t __a)
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vld1_lane_p64 (const poly64_t *__src, poly64x1_t __vec, const int __lane)
+ {
+- return (int16x4_t) {__a, __a, __a, __a};
++ return __aarch64_vset_lane_any (*__src, __vec, __lane);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vdup_n_s32 (int32_t __a)
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_s8 (const int8_t *__src, int8x8_t __vec, const int __lane)
{
-- return (int8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
-- __a, __a, __a, __a, __a, __a, __a, __a};
+- return (int32x2_t) {__a, __a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vdupq_n_s16 (int32_t __a)
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+-vdup_n_s64 (int64_t __a)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_s16 (const int16_t *__src, int16x4_t __vec, const int __lane)
{
-- return (int16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+- return (int64x1_t) {__a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vdupq_n_s32 (int32_t __a)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vdup_n_u8 (uint8_t __a)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_s32 (const int32_t *__src, int32x2_t __vec, const int __lane)
{
-- return (int32x4_t) {__a, __a, __a, __a};
+- return (uint8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vdupq_n_s64 (int64_t __a)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vdup_n_u16 (uint16_t __a)
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_s64 (const int64_t *__src, int64x1_t __vec, const int __lane)
{
-- return (int64x2_t) {__a, __a};
+- return (uint16x4_t) {__a, __a, __a, __a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vdupq_n_u8 (uint32_t __a)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vdup_n_u32 (uint32_t __a)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_u8 (const uint8_t *__src, uint8x8_t __vec, const int __lane)
{
-- return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
-- __a, __a, __a, __a, __a, __a, __a, __a};
+- return (uint32x2_t) {__a, __a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vdupq_n_u16 (uint32_t __a)
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+-vdup_n_u64 (uint64_t __a)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_u16 (const uint16_t *__src, uint16x4_t __vec, const int __lane)
{
-- return (uint16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+- return (uint64x1_t) {__a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vdupq_n_u32 (uint32_t __a)
+-/* vdupq_n */
+-
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vdupq_n_f32 (float32_t __a)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_u32 (const uint32_t *__src, uint32x2_t __vec, const int __lane)
{
-- return (uint32x4_t) {__a, __a, __a, __a};
+- return (float32x4_t) {__a, __a, __a, __a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vdupq_n_u64 (uint64_t __a)
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vdupq_n_f64 (float64_t __a)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_u64 (const uint64_t *__src, uint64x1_t __vec, const int __lane)
{
-- return (uint64x2_t) {__a, __a};
+- return (float64x2_t) {__a, __a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
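
[Note: vld1_lane_* replaces one lane of an existing vector with a scalar loaded from memory, leaving the other lanes intact; the lane index must be a compile-time constant. For instance — patch_lane is a made-up name:

    #include <arm_neon.h>

    /* Returns {v[0], *p}: lane 1 is reloaded, lane 0 is kept.  */
    float32x2_t
    patch_lane (const float32_t *p, float32x2_t v)
    {
      return vld1_lane_f32 (p, v, 1);
    }
]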
--/* vdup_lane */
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+-vdupq_n_p8 (uint32_t __a)
+/* vld1q_lane */
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vdup_lane_f32 (float32x2_t __a, const int __b)
++
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_f16 (const float16_t *__src, float16x8_t __vec, const int __lane)
{
-- return __aarch64_vdup_lane_f32 (__a, __b);
+- return (poly8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
+- __a, __a, __a, __a, __a, __a, __a, __a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vdup_lane_f64 (float64x1_t __a, const int __b)
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+-vdupq_n_p16 (uint32_t __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_f32 (const float32_t *__src, float32x4_t __vec, const int __lane)
{
-- return __aarch64_vdup_lane_f64 (__a, __b);
+- return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vdup_lane_p8 (poly8x8_t __a, const int __b)
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+-vdupq_n_s8 (int32_t __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_f64 (const float64_t *__src, float64x2_t __vec, const int __lane)
{
-- return __aarch64_vdup_lane_p8 (__a, __b);
+- return (int8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
+- __a, __a, __a, __a, __a, __a, __a, __a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vdup_lane_p16 (poly16x4_t __a, const int __b)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vdupq_n_s16 (int32_t __a)
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_p8 (const poly8_t *__src, poly8x16_t __vec, const int __lane)
{
-- return __aarch64_vdup_lane_p16 (__a, __b);
+- return (int16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vdup_lane_s8 (int8x8_t __a, const int __b)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vdupq_n_s32 (int32_t __a)
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_p16 (const poly16_t *__src, poly16x8_t __vec, const int __lane)
{
-- return __aarch64_vdup_lane_s8 (__a, __b);
+- return (int32x4_t) {__a, __a, __a, __a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vdup_lane_s16 (int16x4_t __a, const int __b)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vdupq_n_s64 (int64_t __a)
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vld1q_lane_p64 (const poly64_t *__src, poly64x2_t __vec, const int __lane)
+ {
+- return (int64x2_t) {__a, __a};
++ return __aarch64_vset_lane_any (*__src, __vec, __lane);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vdupq_n_u8 (uint32_t __a)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_s8 (const int8_t *__src, int8x16_t __vec, const int __lane)
{
-- return __aarch64_vdup_lane_s16 (__a, __b);
+- return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
+- __a, __a, __a, __a, __a, __a, __a, __a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vdup_lane_s32 (int32x2_t __a, const int __b)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vdupq_n_u16 (uint32_t __a)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_s16 (const int16_t *__src, int16x8_t __vec, const int __lane)
{
-- return __aarch64_vdup_lane_s32 (__a, __b);
+- return (uint16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vdup_lane_s64 (int64x1_t __a, const int __b)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vdupq_n_u32 (uint32_t __a)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_s32 (const int32_t *__src, int32x4_t __vec, const int __lane)
{
-- return __aarch64_vdup_lane_s64 (__a, __b);
+- return (uint32x4_t) {__a, __a, __a, __a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vdup_lane_u8 (uint8x8_t __a, const int __b)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vdupq_n_u64 (uint64_t __a)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_s64 (const int64_t *__src, int64x2_t __vec, const int __lane)
{
-- return __aarch64_vdup_lane_u8 (__a, __b);
+- return (uint64x2_t) {__a, __a};
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vdup_lane_u16 (uint16x4_t __a, const int __b)
+-/* vdup_lane */
+-
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vdup_lane_f32 (float32x2_t __a, const int __b)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_u8 (const uint8_t *__src, uint8x16_t __vec, const int __lane)
{
-- return __aarch64_vdup_lane_u16 (__a, __b);
+- return __aarch64_vdup_lane_f32 (__a, __b);
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vdup_lane_u32 (uint32x2_t __a, const int __b)
+-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+-vdup_lane_f64 (float64x1_t __a, const int __b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_u16 (const uint16_t *__src, uint16x8_t __vec, const int __lane)
{
-- return __aarch64_vdup_lane_u32 (__a, __b);
+- return __aarch64_vdup_lane_f64 (__a, __b);
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vdup_lane_u64 (uint64x1_t __a, const int __b)
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+-vdup_lane_p8 (poly8x8_t __a, const int __b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_u32 (const uint32_t *__src, uint32x4_t __vec, const int __lane)
{
-- return __aarch64_vdup_lane_u64 (__a, __b);
+- return __aarch64_vdup_lane_p8 (__a, __b);
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--/* vdup_laneq */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vdup_laneq_f32 (float32x4_t __a, const int __b)
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+-vdup_lane_p16 (poly16x4_t __a, const int __b)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_u64 (const uint64_t *__src, uint64x2_t __vec, const int __lane)
{
-- return __aarch64_vdup_laneq_f32 (__a, __b);
+- return __aarch64_vdup_lane_p16 (__a, __b);
+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
}
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vdup_laneq_f64 (float64x2_t __a, const int __b)
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vdup_lane_s8 (int8x8_t __a, const int __b)
+/* vldn */
+
+__extension__ extern __inline int64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_s64 (const int64_t * __a)
{
-- return __aarch64_vdup_laneq_f64 (__a, __b);
+- return __aarch64_vdup_lane_s8 (__a, __b);
+ int64x1x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a);
@@ -27582,13 +29184,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vdup_laneq_p8 (poly8x16_t __a, const int __b)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vdup_lane_s16 (int16x4_t __a, const int __b)
+__extension__ extern __inline uint64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_u64 (const uint64_t * __a)
{
-- return __aarch64_vdup_laneq_p8 (__a, __b);
+- return __aarch64_vdup_lane_s16 (__a, __b);
+ uint64x1x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a);
@@ -27597,13 +29199,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vdup_laneq_p16 (poly16x8_t __a, const int __b)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vdup_lane_s32 (int32x2_t __a, const int __b)
+__extension__ extern __inline float64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_f64 (const float64_t * __a)
{
-- return __aarch64_vdup_laneq_p16 (__a, __b);
+- return __aarch64_vdup_lane_s32 (__a, __b);
+ float64x1x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2df ((const __builtin_aarch64_simd_df *) __a);
@@ -27612,13 +29214,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vdup_laneq_s8 (int8x16_t __a, const int __b)
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+-vdup_lane_s64 (int64x1_t __a, const int __b)
+__extension__ extern __inline int8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_s8 (const int8_t * __a)
{
-- return __aarch64_vdup_laneq_s8 (__a, __b);
+- return __aarch64_vdup_lane_s64 (__a, __b);
+ int8x8x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -27627,13 +29229,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vdup_laneq_s16 (int16x8_t __a, const int __b)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vdup_lane_u8 (uint8x8_t __a, const int __b)
+__extension__ extern __inline poly8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_p8 (const poly8_t * __a)
{
-- return __aarch64_vdup_laneq_s16 (__a, __b);
+- return __aarch64_vdup_lane_u8 (__a, __b);
+ poly8x8x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -27642,13 +29244,28 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vdup_laneq_s32 (int32x4_t __a, const int __b)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vdup_lane_u16 (uint16x4_t __a, const int __b)
++__extension__ extern __inline poly64x1x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vld2_p64 (const poly64_t * __a)
+ {
+- return __aarch64_vdup_lane_u16 (__a, __b);
++ poly64x1x2_t ret;
++ __builtin_aarch64_simd_oi __o;
++ __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a);
++ ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 0);
++ ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 1);
++ return ret;
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vdup_lane_u32 (uint32x2_t __a, const int __b)
+__extension__ extern __inline int16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_s16 (const int16_t * __a)
{
-- return __aarch64_vdup_laneq_s32 (__a, __b);
+- return __aarch64_vdup_lane_u32 (__a, __b);
+ int16x4x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -27657,13 +29274,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vdup_laneq_s64 (int64x2_t __a, const int __b)
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+-vdup_lane_u64 (uint64x1_t __a, const int __b)
+__extension__ extern __inline poly16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_p16 (const poly16_t * __a)
{
-- return __aarch64_vdup_laneq_s64 (__a, __b);
+- return __aarch64_vdup_lane_u64 (__a, __b);
+ poly16x4x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -27672,13 +29289,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vdup_laneq_u8 (uint8x16_t __a, const int __b)
+-/* vdup_laneq */
+-
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vdup_laneq_f32 (float32x4_t __a, const int __b)
+__extension__ extern __inline int32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_s32 (const int32_t * __a)
{
-- return __aarch64_vdup_laneq_u8 (__a, __b);
+- return __aarch64_vdup_laneq_f32 (__a, __b);
+ int32x2x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
@@ -27687,13 +29306,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vdup_laneq_u16 (uint16x8_t __a, const int __b)
+-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+-vdup_laneq_f64 (float64x2_t __a, const int __b)
+__extension__ extern __inline uint8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_u8 (const uint8_t * __a)
{
-- return __aarch64_vdup_laneq_u16 (__a, __b);
+- return __aarch64_vdup_laneq_f64 (__a, __b);
+ uint8x8x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -27702,13 +29321,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vdup_laneq_u32 (uint32x4_t __a, const int __b)
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+-vdup_laneq_p8 (poly8x16_t __a, const int __b)
+__extension__ extern __inline uint16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_u16 (const uint16_t * __a)
{
-- return __aarch64_vdup_laneq_u32 (__a, __b);
+- return __aarch64_vdup_laneq_p8 (__a, __b);
+ uint16x4x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -27717,13 +29336,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vdup_laneq_u64 (uint64x2_t __a, const int __b)
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+-vdup_laneq_p16 (poly16x8_t __a, const int __b)
+__extension__ extern __inline uint32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_u32 (const uint32_t * __a)
{
-- return __aarch64_vdup_laneq_u64 (__a, __b);
+- return __aarch64_vdup_laneq_p16 (__a, __b);
+ uint32x2x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
@@ -27732,14 +29351,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vdupq_lane */
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vdupq_lane_f32 (float32x2_t __a, const int __b)
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vdup_laneq_s8 (int8x16_t __a, const int __b)
+__extension__ extern __inline float16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_f16 (const float16_t * __a)
{
-- return __aarch64_vdupq_lane_f32 (__a, __b);
+- return __aarch64_vdup_laneq_s8 (__a, __b);
+ float16x4x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v4hf (__a);
@@ -27748,13 +29366,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vdupq_lane_f64 (float64x1_t __a, const int __b)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vdup_laneq_s16 (int16x8_t __a, const int __b)
+__extension__ extern __inline float32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_f32 (const float32_t * __a)
{
-- return __aarch64_vdupq_lane_f64 (__a, __b);
+- return __aarch64_vdup_laneq_s16 (__a, __b);
+ float32x2x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v2sf ((const __builtin_aarch64_simd_sf *) __a);
@@ -27763,13 +29381,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vdupq_lane_p8 (poly8x8_t __a, const int __b)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vdup_laneq_s32 (int32x4_t __a, const int __b)
+__extension__ extern __inline int8x16x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_s8 (const int8_t * __a)
{
-- return __aarch64_vdupq_lane_p8 (__a, __b);
+- return __aarch64_vdup_laneq_s32 (__a, __b);
+ int8x16x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -27778,13 +29396,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vdupq_lane_p16 (poly16x4_t __a, const int __b)
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+-vdup_laneq_s64 (int64x2_t __a, const int __b)
+__extension__ extern __inline poly8x16x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_p8 (const poly8_t * __a)
{
-- return __aarch64_vdupq_lane_p16 (__a, __b);
+- return __aarch64_vdup_laneq_s64 (__a, __b);
+ poly8x16x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -27793,13 +29411,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vdupq_lane_s8 (int8x8_t __a, const int __b)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vdup_laneq_u8 (uint8x16_t __a, const int __b)
+__extension__ extern __inline int16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_s16 (const int16_t * __a)
{
-- return __aarch64_vdupq_lane_s8 (__a, __b);
+- return __aarch64_vdup_laneq_u8 (__a, __b);
+ int16x8x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -27808,13 +29426,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vdupq_lane_s16 (int16x4_t __a, const int __b)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vdup_laneq_u16 (uint16x8_t __a, const int __b)
+__extension__ extern __inline poly16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_p16 (const poly16_t * __a)
{
-- return __aarch64_vdupq_lane_s16 (__a, __b);
+- return __aarch64_vdup_laneq_u16 (__a, __b);
+ poly16x8x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -27823,13 +29441,28 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vdupq_lane_s32 (int32x2_t __a, const int __b)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vdup_laneq_u32 (uint32x4_t __a, const int __b)
++__extension__ extern __inline poly64x2x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vld2q_p64 (const poly64_t * __a)
+ {
+- return __aarch64_vdup_laneq_u32 (__a, __b);
++ poly64x2x2_t ret;
++ __builtin_aarch64_simd_oi __o;
++ __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
++ ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 0);
++ ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 1);
++ return ret;
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+-vdup_laneq_u64 (uint64x2_t __a, const int __b)
+__extension__ extern __inline int32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_s32 (const int32_t * __a)
{
-- return __aarch64_vdupq_lane_s32 (__a, __b);
+- return __aarch64_vdup_laneq_u64 (__a, __b);
+ int32x4x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
@@ -27838,13 +29471,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vdupq_lane_s64 (int64x1_t __a, const int __b)
+-/* vdupq_lane */
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vdupq_lane_f32 (float32x2_t __a, const int __b)
+__extension__ extern __inline int64x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_s64 (const int64_t * __a)
{
-- return __aarch64_vdupq_lane_s64 (__a, __b);
+- return __aarch64_vdupq_lane_f32 (__a, __b);
+ int64x2x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
@@ -27853,13 +29487,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vdupq_lane_u8 (uint8x8_t __a, const int __b)
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vdupq_lane_f64 (float64x1_t __a, const int __b)
+__extension__ extern __inline uint8x16x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_u8 (const uint8_t * __a)
{
-- return __aarch64_vdupq_lane_u8 (__a, __b);
+- return __aarch64_vdupq_lane_f64 (__a, __b);
+ uint8x16x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -27868,13 +29502,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vdupq_lane_u16 (uint16x4_t __a, const int __b)
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+-vdupq_lane_p8 (poly8x8_t __a, const int __b)
+__extension__ extern __inline uint16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_u16 (const uint16_t * __a)
{
-- return __aarch64_vdupq_lane_u16 (__a, __b);
+- return __aarch64_vdupq_lane_p8 (__a, __b);
+ uint16x8x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -27883,13 +29517,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vdupq_lane_u32 (uint32x2_t __a, const int __b)
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+-vdupq_lane_p16 (poly16x4_t __a, const int __b)
+__extension__ extern __inline uint32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_u32 (const uint32_t * __a)
{
-- return __aarch64_vdupq_lane_u32 (__a, __b);
+- return __aarch64_vdupq_lane_p16 (__a, __b);
+ uint32x4x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
@@ -27898,13 +29532,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vdupq_lane_u64 (uint64x1_t __a, const int __b)
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+-vdupq_lane_s8 (int8x8_t __a, const int __b)
+__extension__ extern __inline uint64x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_u64 (const uint64_t * __a)
{
-- return __aarch64_vdupq_lane_u64 (__a, __b);
+- return __aarch64_vdupq_lane_s8 (__a, __b);
+ uint64x2x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
@@ -27913,14 +29547,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vdupq_laneq */
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vdupq_laneq_f32 (float32x4_t __a, const int __b)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vdupq_lane_s16 (int16x4_t __a, const int __b)
+__extension__ extern __inline float16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_f16 (const float16_t * __a)
{
-- return __aarch64_vdupq_laneq_f32 (__a, __b);
+- return __aarch64_vdupq_lane_s16 (__a, __b);
+ float16x8x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v8hf (__a);
@@ -27929,13 +29562,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vdupq_laneq_f64 (float64x2_t __a, const int __b)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vdupq_lane_s32 (int32x2_t __a, const int __b)
+__extension__ extern __inline float32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_f32 (const float32_t * __a)
{
-- return __aarch64_vdupq_laneq_f64 (__a, __b);
+- return __aarch64_vdupq_lane_s32 (__a, __b);
+ float32x4x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v4sf ((const __builtin_aarch64_simd_sf *) __a);
@@ -27944,13 +29577,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vdupq_laneq_p8 (poly8x16_t __a, const int __b)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vdupq_lane_s64 (int64x1_t __a, const int __b)
+__extension__ extern __inline float64x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_f64 (const float64_t * __a)
{
-- return __aarch64_vdupq_laneq_p8 (__a, __b);
+- return __aarch64_vdupq_lane_s64 (__a, __b);
+ float64x2x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v2df ((const __builtin_aarch64_simd_df *) __a);
@@ -27959,13 +29592,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vdupq_laneq_p16 (poly16x8_t __a, const int __b)
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vdupq_lane_u8 (uint8x8_t __a, const int __b)
+__extension__ extern __inline int64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_s64 (const int64_t * __a)
{
-- return __aarch64_vdupq_laneq_p16 (__a, __b);
+- return __aarch64_vdupq_lane_u8 (__a, __b);
+ int64x1x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a);
@@ -27975,13 +29608,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vdupq_laneq_s8 (int8x16_t __a, const int __b)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vdupq_lane_u16 (uint16x4_t __a, const int __b)
+__extension__ extern __inline uint64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_u64 (const uint64_t * __a)
{
-- return __aarch64_vdupq_laneq_s8 (__a, __b);
+- return __aarch64_vdupq_lane_u16 (__a, __b);
+ uint64x1x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a);
@@ -27991,13 +29624,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vdupq_laneq_s16 (int16x8_t __a, const int __b)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vdupq_lane_u32 (uint32x2_t __a, const int __b)
+__extension__ extern __inline float64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_f64 (const float64_t * __a)
{
-- return __aarch64_vdupq_laneq_s16 (__a, __b);
+- return __aarch64_vdupq_lane_u32 (__a, __b);
+ float64x1x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3df ((const __builtin_aarch64_simd_df *) __a);
@@ -28007,13 +29640,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vdupq_laneq_s32 (int32x4_t __a, const int __b)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vdupq_lane_u64 (uint64x1_t __a, const int __b)
+__extension__ extern __inline int8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_s8 (const int8_t * __a)
{
-- return __aarch64_vdupq_laneq_s32 (__a, __b);
+- return __aarch64_vdupq_lane_u64 (__a, __b);
+ int8x8x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -28023,13 +29656,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vdupq_laneq_s64 (int64x2_t __a, const int __b)
+-/* vdupq_laneq */
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vdupq_laneq_f32 (float32x4_t __a, const int __b)
+__extension__ extern __inline poly8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_p8 (const poly8_t * __a)
{
-- return __aarch64_vdupq_laneq_s64 (__a, __b);
+- return __aarch64_vdupq_laneq_f32 (__a, __b);
+ poly8x8x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -28039,13 +29673,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vdupq_laneq_u8 (uint8x16_t __a, const int __b)
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vdupq_laneq_f64 (float64x2_t __a, const int __b)
+__extension__ extern __inline int16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_s16 (const int16_t * __a)
{
-- return __aarch64_vdupq_laneq_u8 (__a, __b);
+- return __aarch64_vdupq_laneq_f64 (__a, __b);
+ int16x4x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -28055,13 +29689,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vdupq_laneq_u16 (uint16x8_t __a, const int __b)
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+-vdupq_laneq_p8 (poly8x16_t __a, const int __b)
+__extension__ extern __inline poly16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_p16 (const poly16_t * __a)
{
-- return __aarch64_vdupq_laneq_u16 (__a, __b);
+- return __aarch64_vdupq_laneq_p8 (__a, __b);
+ poly16x4x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -28071,13 +29705,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vdupq_laneq_u32 (uint32x4_t __a, const int __b)
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+-vdupq_laneq_p16 (poly16x8_t __a, const int __b)
+__extension__ extern __inline int32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_s32 (const int32_t * __a)
{
-- return __aarch64_vdupq_laneq_u32 (__a, __b);
+- return __aarch64_vdupq_laneq_p16 (__a, __b);
+ int32x2x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3v2si ((const __builtin_aarch64_simd_si *) __a);
@@ -28087,13 +29721,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vdupq_laneq_u64 (uint64x2_t __a, const int __b)
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+-vdupq_laneq_s8 (int8x16_t __a, const int __b)
+__extension__ extern __inline uint8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_u8 (const uint8_t * __a)
{
-- return __aarch64_vdupq_laneq_u64 (__a, __b);
+- return __aarch64_vdupq_laneq_s8 (__a, __b);
+ uint8x8x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -28103,14 +29737,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vdupb_lane */
--__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
--vdupb_lane_p8 (poly8x8_t __a, const int __b)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vdupq_laneq_s16 (int16x8_t __a, const int __b)
+__extension__ extern __inline uint16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_u16 (const uint16_t * __a)
{
-- return __aarch64_vget_lane_any (__a, __b);
+- return __aarch64_vdupq_laneq_s16 (__a, __b);
+ uint16x4x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -28120,13 +29753,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vdupb_lane_s8 (int8x8_t __a, const int __b)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vdupq_laneq_s32 (int32x4_t __a, const int __b)
+__extension__ extern __inline uint32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_u32 (const uint32_t * __a)
{
-- return __aarch64_vget_lane_any (__a, __b);
+- return __aarch64_vdupq_laneq_s32 (__a, __b);
+ uint32x2x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3v2si ((const __builtin_aarch64_simd_si *) __a);
@@ -28136,13 +29769,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vdupb_lane_u8 (uint8x8_t __a, const int __b)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vdupq_laneq_s64 (int64x2_t __a, const int __b)
+__extension__ extern __inline float16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_f16 (const float16_t * __a)
{
-- return __aarch64_vget_lane_any (__a, __b);
+- return __aarch64_vdupq_laneq_s64 (__a, __b);
+ float16x4x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3v4hf (__a);
@@ -28152,14 +29785,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vduph_lane */
--__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
--vduph_lane_p16 (poly16x4_t __a, const int __b)
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vdupq_laneq_u8 (uint8x16_t __a, const int __b)
+__extension__ extern __inline float32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_f32 (const float32_t * __a)
{
-- return __aarch64_vget_lane_any (__a, __b);
+- return __aarch64_vdupq_laneq_u8 (__a, __b);
+ float32x2x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3v2sf ((const __builtin_aarch64_simd_sf *) __a);
@@ -28169,13 +29801,29 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vduph_lane_s16 (int16x4_t __a, const int __b)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vdupq_laneq_u16 (uint16x8_t __a, const int __b)
++__extension__ extern __inline poly64x1x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vld3_p64 (const poly64_t * __a)
+ {
+- return __aarch64_vdupq_laneq_u16 (__a, __b);
++ poly64x1x3_t ret;
++ __builtin_aarch64_simd_ci __o;
++ __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a);
++ ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 0);
++ ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 1);
++ ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 2);
++ return ret;
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vdupq_laneq_u32 (uint32x4_t __a, const int __b)
+__extension__ extern __inline int8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_s8 (const int8_t * __a)
{
-- return __aarch64_vget_lane_any (__a, __b);
+- return __aarch64_vdupq_laneq_u32 (__a, __b);
+ int8x16x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -28185,13 +29833,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vduph_lane_u16 (uint16x4_t __a, const int __b)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vdupq_laneq_u64 (uint64x2_t __a, const int __b)
+__extension__ extern __inline poly8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_p8 (const poly8_t * __a)
{
-- return __aarch64_vget_lane_any (__a, __b);
+- return __aarch64_vdupq_laneq_u64 (__a, __b);
+ poly8x16x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -28201,9 +29849,9 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vdups_lane */
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vdups_lane_f32 (float32x2_t __a, const int __b)
+-/* vdupb_lane */
+-__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
+-vdupb_lane_p8 (poly8x8_t __a, const int __b)
+__extension__ extern __inline int16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_s16 (const int16_t * __a)
@@ -28218,8 +29866,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vdups_lane_s32 (int32x2_t __a, const int __b)
+-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
+-vdupb_lane_s8 (int8x8_t __a, const int __b)
+__extension__ extern __inline poly16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_p16 (const poly16_t * __a)
@@ -28234,8 +29882,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vdups_lane_u32 (uint32x2_t __a, const int __b)
+-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
+-vdupb_lane_u8 (uint8x8_t __a, const int __b)
+__extension__ extern __inline int32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_s32 (const int32_t * __a)
@@ -28250,15 +29898,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vdupd_lane */
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vdupd_lane_f64 (float64x1_t __a, const int __b)
+-/* vduph_lane */
+-__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
+-vduph_lane_p16 (poly16x4_t __a, const int __b)
+__extension__ extern __inline int64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_s64 (const int64_t * __a)
{
-- __AARCH64_LANE_CHECK (__a, __b);
-- return __a[0];
+- return __aarch64_vget_lane_any (__a, __b);
+ int64x2x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
@@ -28268,14 +29915,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vdupd_lane_s64 (int64x1_t __a, const int __b)
+-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+-vduph_lane_s16 (int16x4_t __a, const int __b)
+__extension__ extern __inline uint8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_u8 (const uint8_t * __a)
{
-- __AARCH64_LANE_CHECK (__a, __b);
-- return __a[0];
+- return __aarch64_vget_lane_any (__a, __b);
+ uint8x16x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -28285,14 +29931,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vdupd_lane_u64 (uint64x1_t __a, const int __b)
+-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
+-vduph_lane_u16 (uint16x4_t __a, const int __b)
+__extension__ extern __inline uint16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_u16 (const uint16_t * __a)
{
-- __AARCH64_LANE_CHECK (__a, __b);
-- return __a[0];
+- return __aarch64_vget_lane_any (__a, __b);
+ uint16x8x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -28302,9 +29947,9 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vdupb_laneq */
--__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
--vdupb_laneq_p8 (poly8x16_t __a, const int __b)
+-/* vdups_lane */
+-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+-vdups_lane_f32 (float32x2_t __a, const int __b)
+__extension__ extern __inline uint32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_u32 (const uint32_t * __a)
@@ -28319,8 +29964,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vdupb_laneq_s8 (int8x16_t __a, const int __attribute__ ((unused)) __b)
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vdups_lane_s32 (int32x2_t __a, const int __b)
+__extension__ extern __inline uint64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_u64 (const uint64_t * __a)
@@ -28335,8 +29980,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vdupb_laneq_u8 (uint8x16_t __a, const int __b)
+-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+-vdups_lane_u32 (uint32x2_t __a, const int __b)
+__extension__ extern __inline float16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_f16 (const float16_t * __a)
@@ -28351,14 +29996,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vduph_laneq */
--__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
--vduph_laneq_p16 (poly16x8_t __a, const int __b)
+-/* vdupd_lane */
+-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+-vdupd_lane_f64 (float64x1_t __a, const int __b)
+__extension__ extern __inline float32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_f32 (const float32_t * __a)
{
-- return __aarch64_vget_lane_any (__a, __b);
+- __AARCH64_LANE_CHECK (__a, __b);
+- return __a[0];
+ float32x4x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3v4sf ((const __builtin_aarch64_simd_sf *) __a);
@@ -28368,13 +30014,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vduph_laneq_s16 (int16x8_t __a, const int __b)
+-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+-vdupd_lane_s64 (int64x1_t __a, const int __b)
+__extension__ extern __inline float64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_f64 (const float64_t * __a)
{
-- return __aarch64_vget_lane_any (__a, __b);
+- __AARCH64_LANE_CHECK (__a, __b);
+- return __a[0];
+ float64x2x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3v2df ((const __builtin_aarch64_simd_df *) __a);
@@ -28384,8 +30031,26 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vduph_laneq_u16 (uint16x8_t __a, const int __b)
+-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+-vdupd_lane_u64 (uint64x1_t __a, const int __b)
++__extension__ extern __inline poly64x2x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vld3q_p64 (const poly64_t * __a)
+ {
+- __AARCH64_LANE_CHECK (__a, __b);
+- return __a[0];
++ poly64x2x3_t ret;
++ __builtin_aarch64_simd_ci __o;
++ __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
++ ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 0);
++ ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 1);
++ ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 2);
++ return ret;
+ }
+
+-/* vdupb_laneq */
+-__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
+-vdupb_laneq_p8 (poly8x16_t __a, const int __b)
+__extension__ extern __inline int64x1x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_s64 (const int64_t * __a)
@@ -28401,9 +30066,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vdups_laneq */
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vdups_laneq_f32 (float32x4_t __a, const int __b)
+-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
+-vdupb_laneq_s8 (int8x16_t __a, const int __attribute__ ((unused)) __b)
+__extension__ extern __inline uint64x1x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_u64 (const uint64_t * __a)
@@ -28419,8 +30083,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vdups_laneq_s32 (int32x4_t __a, const int __b)
+-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
+-vdupb_laneq_u8 (uint8x16_t __a, const int __b)
+__extension__ extern __inline float64x1x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_f64 (const float64_t * __a)
@@ -28436,8 +30100,9 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vdups_laneq_u32 (uint32x4_t __a, const int __b)
+-/* vduph_laneq */
+-__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
+-vduph_laneq_p16 (poly16x8_t __a, const int __b)
+__extension__ extern __inline int8x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_s8 (const int8_t * __a)
@@ -28453,9 +30118,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vdupd_laneq */
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vdupd_laneq_f64 (float64x2_t __a, const int __b)
+-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+-vduph_laneq_s16 (int16x8_t __a, const int __b)
+__extension__ extern __inline poly8x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_p8 (const poly8_t * __a)
@@ -28471,8 +30135,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vdupd_laneq_s64 (int64x2_t __a, const int __b)
+-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
+-vduph_laneq_u16 (uint16x8_t __a, const int __b)
+__extension__ extern __inline int16x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_s16 (const int16_t * __a)
@@ -28488,8 +30152,9 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vdupd_laneq_u64 (uint64x2_t __a, const int __b)
+-/* vdups_laneq */
+-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+-vdups_laneq_f32 (float32x4_t __a, const int __b)
+__extension__ extern __inline poly16x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_p16 (const poly16_t * __a)
@@ -28505,20 +30170,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vext */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vext_f32 (float32x2_t __a, float32x2_t __b, __const int __c)
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vdups_laneq_s32 (int32x4_t __a, const int __b)
+__extension__ extern __inline int32x2x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_s32 (const int32_t * __a)
{
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1});
--#endif
+- return __aarch64_vget_lane_any (__a, __b);
+ int32x2x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4v2si ((const __builtin_aarch64_simd_si *) __a);
@@ -28529,27 +30187,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vext_f64 (float64x1_t __a, float64x1_t __b, __const int __c)
--{
-- __AARCH64_LANE_CHECK (__a, __c);
-- /* The only possible index to the assembler instruction returns element 0. */
-- return __a;
--}
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vext_p8 (poly8x8_t __a, poly8x8_t __b, __const int __c)
+-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+-vdups_laneq_u32 (uint32x4_t __a, const int __b)
+__extension__ extern __inline uint8x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_u8 (const uint8_t * __a)
{
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint8x8_t)
-- {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
--#else
-- return __builtin_shuffle (__a, __b,
-- (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
--#endif
+- return __aarch64_vget_lane_any (__a, __b);
+ uint8x8x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4v8qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -28560,19 +30204,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vext_p16 (poly16x4_t __a, poly16x4_t __b, __const int __c)
+-/* vdupd_laneq */
+-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+-vdupd_laneq_f64 (float64x2_t __a, const int __b)
+__extension__ extern __inline uint16x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_u16 (const uint16_t * __a)
{
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a,
-- (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3});
--#endif
+- return __aarch64_vget_lane_any (__a, __b);
+ uint16x4x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -28583,20 +30222,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vext_s8 (int8x8_t __a, int8x8_t __b, __const int __c)
+-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+-vdupd_laneq_s64 (int64x2_t __a, const int __b)
+__extension__ extern __inline uint32x2x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_u32 (const uint32_t * __a)
{
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint8x8_t)
-- {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
--#else
-- return __builtin_shuffle (__a, __b,
-- (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
--#endif
+- return __aarch64_vget_lane_any (__a, __b);
+ uint32x2x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4v2si ((const __builtin_aarch64_simd_si *) __a);
@@ -28607,19 +30239,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vext_s16 (int16x4_t __a, int16x4_t __b, __const int __c)
+-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+-vdupd_laneq_u64 (uint64x2_t __a, const int __b)
+__extension__ extern __inline float16x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_f16 (const float16_t * __a)
{
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a,
-- (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3});
--#endif
+- return __aarch64_vget_lane_any (__a, __b);
+ float16x4x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4v4hf (__a);
@@ -28630,8 +30256,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vext_s32 (int32x2_t __a, int32x2_t __b, __const int __c)
+-/* vext */
+-
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vext_f32 (float32x2_t __a, float32x2_t __b, __const int __c)
+__extension__ extern __inline float32x2x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_f32 (const float32_t * __a)
@@ -28652,15 +30280,39 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vext_s64 (int64x1_t __a, int64x1_t __b, __const int __c)
-+__extension__ extern __inline int8x16x4_t
+-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+-vext_f64 (float64x1_t __a, float64x1_t __b, __const int __c)
++__extension__ extern __inline poly64x1x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_s8 (const int8_t * __a)
++vld4_p64 (const poly64_t * __a)
{
- __AARCH64_LANE_CHECK (__a, __c);
- /* The only possible index to the assembler instruction returns element 0. */
- return __a;
++ poly64x1x4_t ret;
++ __builtin_aarch64_simd_xi __o;
++ __o = __builtin_aarch64_ld4di ((const __builtin_aarch64_simd_di *) __a);
++ ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 0);
++ ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 1);
++ ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 2);
++ ret.val[3] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 3);
++ return ret;
+ }
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+-vext_p8 (poly8x8_t __a, poly8x8_t __b, __const int __c)
++
++__extension__ extern __inline int8x16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vld4q_s8 (const int8_t * __a)
+ {
+- __AARCH64_LANE_CHECK (__a, __c);
+-#ifdef __AARCH64EB__
+- return __builtin_shuffle (__b, __a, (uint8x8_t)
+- {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
+-#else
+- return __builtin_shuffle (__a, __b,
+- (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
+-#endif
+ int8x16x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -28671,19 +30323,18 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vext_u8 (uint8x8_t __a, uint8x8_t __b, __const int __c)
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+-vext_p16 (poly16x4_t __a, poly16x4_t __b, __const int __c)
+__extension__ extern __inline poly8x16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_p8 (const poly8_t * __a)
{
- __AARCH64_LANE_CHECK (__a, __c);
-#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint8x8_t)
-- {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
+- return __builtin_shuffle (__b, __a,
+- (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
-#else
-- return __builtin_shuffle (__a, __b,
-- (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
+- return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3});
-#endif
+ poly8x16x4_t ret;
+ __builtin_aarch64_simd_xi __o;
@@ -28695,18 +30346,19 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vext_u16 (uint16x4_t __a, uint16x4_t __b, __const int __c)
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vext_s8 (int8x8_t __a, int8x8_t __b, __const int __c)
+__extension__ extern __inline int16x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_s16 (const int16_t * __a)
{
- __AARCH64_LANE_CHECK (__a, __c);
-#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a,
-- (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
+- return __builtin_shuffle (__b, __a, (uint8x8_t)
+- {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
-#else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3});
+- return __builtin_shuffle (__a, __b,
+- (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
-#endif
+ int16x8x4_t ret;
+ __builtin_aarch64_simd_xi __o;
@@ -28718,17 +30370,18 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vext_u32 (uint32x2_t __a, uint32x2_t __b, __const int __c)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vext_s16 (int16x4_t __a, int16x4_t __b, __const int __c)
+__extension__ extern __inline poly16x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_p16 (const poly16_t * __a)
{
- __AARCH64_LANE_CHECK (__a, __c);
-#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c});
+- return __builtin_shuffle (__b, __a,
+- (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
-#else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1});
+- return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3});
-#endif
+ poly16x8x4_t ret;
+ __builtin_aarch64_simd_xi __o;
@@ -28740,15 +30393,18 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vext_u64 (uint64x1_t __a, uint64x1_t __b, __const int __c)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vext_s32 (int32x2_t __a, int32x2_t __b, __const int __c)
+__extension__ extern __inline int32x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_s32 (const int32_t * __a)
{
- __AARCH64_LANE_CHECK (__a, __c);
-- /* The only possible index to the assembler instruction returns element 0. */
-- return __a;
+-#ifdef __AARCH64EB__
+- return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c});
+-#else
+- return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1});
+-#endif
+ int32x4x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
@@ -28759,19 +30415,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vextq_f32 (float32x4_t __a, float32x4_t __b, __const int __c)
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+-vext_s64 (int64x1_t __a, int64x1_t __b, __const int __c)
+__extension__ extern __inline int64x2x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_s64 (const int64_t * __a)
{
- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a,
-- (uint32x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {__c, __c+1, __c+2, __c+3});
--#endif
+- /* The only possible index to the assembler instruction returns element 0. */
+- return __a;
+ int64x2x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
@@ -28782,17 +30434,19 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vextq_f64 (float64x2_t __a, float64x2_t __b, __const int __c)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vext_u8 (uint8x8_t __a, uint8x8_t __b, __const int __c)
+__extension__ extern __inline uint8x16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_u8 (const uint8_t * __a)
{
- __AARCH64_LANE_CHECK (__a, __c);
-#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c});
+- return __builtin_shuffle (__b, __a, (uint8x8_t)
+- {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
-#else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1});
+- return __builtin_shuffle (__a, __b,
+- (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
-#endif
+ uint8x16x4_t ret;
+ __builtin_aarch64_simd_xi __o;
@@ -28804,21 +30458,18 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
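
[Note: the deleted vext_* bodies select a contiguous window of lanes from the concatenation of the two arguments via __builtin_shuffle, with the index vector mirrored under __AARCH64EB__ so the result is endian-independent. The observable semantics, sketched through the public intrinsic:

    #include <arm_neon.h>

    /* vext_u8 (a, b, 3) returns lanes 3..10 of the 16-lane
       concatenation {a0..a7, b0..b7}, i.e. {a3..a7, b0, b1, b2}.
       The last argument must be a compile-time constant.  */
    uint8x8_t
    extract_window (uint8x8_t a, uint8x8_t b)
    {
      return vext_u8 (a, b, 3);
    }
]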
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vextq_p8 (poly8x16_t __a, poly8x16_t __b, __const int __c)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vext_u16 (uint16x4_t __a, uint16x4_t __b, __const int __c)
+__extension__ extern __inline uint16x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_u16 (const uint16_t * __a)
{
- __AARCH64_LANE_CHECK (__a, __c);
-#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint8x16_t)
-- {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c,
-- 24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c});
+- return __builtin_shuffle (__b, __a,
+- (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
-#else
-- return __builtin_shuffle (__a, __b, (uint8x16_t)
-- {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7,
-- __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15});
+- return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3});
-#endif
+ uint16x8x4_t ret;
+ __builtin_aarch64_simd_xi __o;
@@ -28830,19 +30481,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vextq_p16 (poly16x8_t __a, poly16x8_t __b, __const int __c)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vext_u32 (uint32x2_t __a, uint32x2_t __b, __const int __c)
+__extension__ extern __inline uint32x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_u32 (const uint32_t * __a)
{
- __AARCH64_LANE_CHECK (__a, __c);
-#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint16x8_t)
-- {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
+- return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c});
-#else
-- return __builtin_shuffle (__a, __b,
-- (uint16x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
+- return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1});
-#endif
+ uint32x4x4_t ret;
+ __builtin_aarch64_simd_xi __o;
@@ -28854,22 +30503,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vextq_s8 (int8x16_t __a, int8x16_t __b, __const int __c)
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+-vext_u64 (uint64x1_t __a, uint64x1_t __b, __const int __c)
+__extension__ extern __inline uint64x2x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_u64 (const uint64_t * __a)
{
- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint8x16_t)
-- {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c,
-- 24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x16_t)
-- {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7,
-- __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15});
--#endif
+- /* The only possible index to the assembler instruction returns element 0. */
+- return __a;
+ uint64x2x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
@@ -28880,19 +30522,18 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vextq_s16 (int16x8_t __a, int16x8_t __b, __const int __c)
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vextq_f32 (float32x4_t __a, float32x4_t __b, __const int __c)
+__extension__ extern __inline float16x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_f16 (const float16_t * __a)
{
- __AARCH64_LANE_CHECK (__a, __c);
-#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint16x8_t)
-- {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
+- return __builtin_shuffle (__b, __a,
+- (uint32x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
-#else
-- return __builtin_shuffle (__a, __b,
-- (uint16x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
+- return __builtin_shuffle (__a, __b, (uint32x4_t) {__c, __c+1, __c+2, __c+3});
-#endif
+ float16x8x4_t ret;
+ __builtin_aarch64_simd_xi __o;
@@ -28904,18 +30545,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vextq_s32 (int32x4_t __a, int32x4_t __b, __const int __c)
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vextq_f64 (float64x2_t __a, float64x2_t __b, __const int __c)
+__extension__ extern __inline float32x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_f32 (const float32_t * __a)
{
- __AARCH64_LANE_CHECK (__a, __c);
-#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a,
-- (uint32x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
+- return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c});
-#else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {__c, __c+1, __c+2, __c+3});
+- return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1});
-#endif
+ float32x4x4_t ret;
+ __builtin_aarch64_simd_xi __o;
@@ -28927,17 +30567,21 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vextq_s64 (int64x2_t __a, int64x2_t __b, __const int __c)
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+-vextq_p8 (poly8x16_t __a, poly8x16_t __b, __const int __c)
+__extension__ extern __inline float64x2x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_f64 (const float64_t * __a)
{
- __AARCH64_LANE_CHECK (__a, __c);
-#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c});
+- return __builtin_shuffle (__b, __a, (uint8x16_t)
+- {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c,
+- 24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c});
-#else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1});
+- return __builtin_shuffle (__a, __b, (uint8x16_t)
+- {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7,
+- __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15});
-#endif
+ float64x2x4_t ret;
+ __builtin_aarch64_simd_xi __o;
@@ -28949,8 +30593,32 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vextq_u8 (uint8x16_t __a, uint8x16_t __b, __const int __c)
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+-vextq_p16 (poly16x8_t __a, poly16x8_t __b, __const int __c)
++__extension__ extern __inline poly64x2x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vld4q_p64 (const poly64_t * __a)
+ {
+- __AARCH64_LANE_CHECK (__a, __c);
+-#ifdef __AARCH64EB__
+- return __builtin_shuffle (__b, __a, (uint16x8_t)
+- {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
+-#else
+- return __builtin_shuffle (__a, __b,
+- (uint16x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
+-#endif
++ poly64x2x4_t ret;
++ __builtin_aarch64_simd_xi __o;
++ __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
++ ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 0);
++ ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 1);
++ ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 2);
++ ret.val[3] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 3);
++ return ret;
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+-vextq_s8 (int8x16_t __a, int8x16_t __b, __const int __c)
+/* vldn_dup */
+
+__extension__ extern __inline int8x8x2_t
@@ -28975,8 +30643,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vextq_u16 (uint16x8_t __a, uint16x8_t __b, __const int __c)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vextq_s16 (int16x8_t __a, int16x8_t __b, __const int __c)
+__extension__ extern __inline int16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_dup_s16 (const int16_t * __a)
@@ -28997,8 +30665,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vextq_u32 (uint32x4_t __a, uint32x4_t __b, __const int __c)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vextq_s32 (int32x4_t __a, int32x4_t __b, __const int __c)
+__extension__ extern __inline int32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_dup_s32 (const int32_t * __a)
@@ -29018,8 +30686,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vextq_u64 (uint64x2_t __a, uint64x2_t __b, __const int __c)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vextq_s64 (int64x2_t __a, int64x2_t __b, __const int __c)
+__extension__ extern __inline float16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_dup_f16 (const float16_t * __a)
@@ -29038,15 +30706,22 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vfma */
--
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vfma_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vextq_u8 (uint8x16_t __a, uint8x16_t __b, __const int __c)
+__extension__ extern __inline float32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_dup_f32 (const float32_t * __a)
{
-- return (float64x1_t) {__builtin_fma (__b[0], __c[0], __a[0])};
+- __AARCH64_LANE_CHECK (__a, __c);
+-#ifdef __AARCH64EB__
+- return __builtin_shuffle (__b, __a, (uint8x16_t)
+- {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c,
+- 24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c});
+-#else
+- return __builtin_shuffle (__a, __b, (uint8x16_t)
+- {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7,
+- __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15});
+-#endif
+ float32x2x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv2sf ((const __builtin_aarch64_simd_sf *) __a);
@@ -29055,13 +30730,20 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
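
[Note: the vld2_dup_* additions map to LD2R: one two-element structure is loaded and each member is replicated across every lane of its result vector. A usage sketch:

    #include <arm_neon.h>

    /* Loads one {re, im} pair and broadcasts it, giving
       val[0] = {re, re} and val[1] = {im, im}.  */
    float32x2x2_t
    splat_complex (const float32_t *p)
    {
      return vld2_dup_f32 (p);
    }
]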
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vfma_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vextq_u16 (uint16x8_t __a, uint16x8_t __b, __const int __c)
+__extension__ extern __inline float64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_dup_f64 (const float64_t * __a)
{
-- return __builtin_aarch64_fmav2sf (__b, __c, __a);
+- __AARCH64_LANE_CHECK (__a, __c);
+-#ifdef __AARCH64EB__
+- return __builtin_shuffle (__b, __a, (uint16x8_t)
+- {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
+-#else
+- return __builtin_shuffle (__a, __b,
+- (uint16x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
+-#endif
+ float64x1x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rdf ((const __builtin_aarch64_simd_df *) __a);
@@ -29070,13 +30752,19 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vfmaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vextq_u32 (uint32x4_t __a, uint32x4_t __b, __const int __c)
+__extension__ extern __inline uint8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_dup_u8 (const uint8_t * __a)
{
-- return __builtin_aarch64_fmav4sf (__b, __c, __a);
+- __AARCH64_LANE_CHECK (__a, __c);
+-#ifdef __AARCH64EB__
+- return __builtin_shuffle (__b, __a,
+- (uint32x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
+-#else
+- return __builtin_shuffle (__a, __b, (uint32x4_t) {__c, __c+1, __c+2, __c+3});
+-#endif
+ uint8x8x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv8qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -29085,13 +30773,18 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vfmaq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vextq_u64 (uint64x2_t __a, uint64x2_t __b, __const int __c)
+__extension__ extern __inline uint16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_dup_u16 (const uint16_t * __a)
{
-- return __builtin_aarch64_fmav2df (__b, __c, __a);
+- __AARCH64_LANE_CHECK (__a, __c);
+-#ifdef __AARCH64EB__
+- return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c});
+-#else
+- return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1});
+-#endif
+ uint16x4x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -29100,13 +30793,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
+-/* vfma */
+-
+-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+-vfma_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
+__extension__ extern __inline uint32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_dup_u32 (const uint32_t * __a)
{
-- return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a);
+- return (float64x1_t) {__builtin_fma (__b[0], __c[0], __a[0])};
+ uint32x2x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv2si ((const __builtin_aarch64_simd_si *) __a);
@@ -29115,13 +30810,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vfma_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+__extension__ extern __inline poly8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_dup_p8 (const poly8_t * __a)
{
-- return __builtin_aarch64_fmav4sf (__b, vdupq_n_f32 (__c), __a);
+- return __builtin_aarch64_fmav2sf (__b, __c, __a);
+ poly8x8x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv8qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -29130,13 +30825,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vfmaq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c)
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vfmaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+__extension__ extern __inline poly16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_dup_p16 (const poly16_t * __a)
{
-- return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c), __a);
+- return __builtin_aarch64_fmav4sf (__b, __c, __a);
+ poly16x4x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -29145,18 +30840,29 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vfma_lane */
--
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vfmaq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
++__extension__ extern __inline poly64x1x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vld2_dup_p64 (const poly64_t * __a)
+ {
+- return __builtin_aarch64_fmav2df (__b, __c, __a);
++ poly64x1x2_t ret;
++ __builtin_aarch64_simd_oi __o;
++ __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a);
++ ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 0);
++ ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 1);
++ return ret;
+ }
+
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vfma_lane_f32 (float32x2_t __a, float32x2_t __b,
-- float32x2_t __c, const int __lane)
+-vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
++
+__extension__ extern __inline int64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_dup_s64 (const int64_t * __a)
{
-- return __builtin_aarch64_fmav2sf (__b,
-- __aarch64_vdup_lane_f32 (__c, __lane),
-- __a);
+- return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a);
+ int64x1x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rdi ((const __builtin_aarch64_simd_di *) __a);
@@ -29165,14 +30871,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vfma_lane_f64 (float64x1_t __a, float64x1_t __b,
-- float64x1_t __c, const int __lane)
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
+__extension__ extern __inline uint64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_dup_u64 (const uint64_t * __a)
{
-- return (float64x1_t) {__builtin_fma (__b[0], __c[0], __a[0])};
+- return __builtin_aarch64_fmav4sf (__b, vdupq_n_f32 (__c), __a);
+ uint64x1x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rdi ((const __builtin_aarch64_simd_di *) __a);
@@ -29181,14 +30886,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vfmad_lane_f64 (float64_t __a, float64_t __b,
-- float64x1_t __c, const int __lane)
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vfmaq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c)
+__extension__ extern __inline int8x16x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_s8 (const int8_t * __a)
{
-- return __builtin_fma (__b, __c[0], __a);
+- return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c), __a);
+ int8x16x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv16qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -29197,14 +30901,18 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vfmas_lane_f32 (float32_t __a, float32_t __b,
-- float32x2_t __c, const int __lane)
+-/* vfma_lane */
+-
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vfma_lane_f32 (float32x2_t __a, float32x2_t __b,
+- float32x2_t __c, const int __lane)
+__extension__ extern __inline poly8x16x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_p8 (const poly8_t * __a)
{
-- return __builtin_fmaf (__b, __aarch64_vget_lane_any (__c, __lane), __a);
+- return __builtin_aarch64_fmav2sf (__b,
+- __aarch64_vdup_lane_f32 (__c, __lane),
+- __a);
+ poly8x16x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv16qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -29213,18 +30921,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vfma_laneq */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vfma_laneq_f32 (float32x2_t __a, float32x2_t __b,
-- float32x4_t __c, const int __lane)
+-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+-vfma_lane_f64 (float64x1_t __a, float64x1_t __b,
+- float64x1_t __c, const int __lane)
+__extension__ extern __inline int16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_s16 (const int16_t * __a)
{
-- return __builtin_aarch64_fmav2sf (__b,
-- __aarch64_vdup_laneq_f32 (__c, __lane),
-- __a);
+- return (float64x1_t) {__builtin_fma (__b[0], __c[0], __a[0])};
+ int16x8x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv8hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -29233,15 +30937,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vfma_laneq_f64 (float64x1_t __a, float64x1_t __b,
-- float64x2_t __c, const int __lane)
+-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+-vfmad_lane_f64 (float64_t __a, float64_t __b,
+- float64x1_t __c, const int __lane)
+__extension__ extern __inline poly16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_p16 (const poly16_t * __a)
{
-- float64_t __c0 = __aarch64_vget_lane_any (__c, __lane);
-- return (float64x1_t) {__builtin_fma (__b[0], __c0, __a[0])};
+- return __builtin_fma (__b, __c[0], __a);
+ poly16x8x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv8hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -29250,14 +30953,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
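
[Note: the vfma*_lane* wrappers being relocated in these hunks fuse a multiply by a single broadcast lane into one by-element FMLA, with no intermediate rounding. In terms of the public API:

    #include <arm_neon.h>

    /* r[i] = a[i] + b[i] * c[1], computed as a fused multiply-add;
       the lane index must be a compile-time constant.  */
    float32x2_t
    fma_by_lane (float32x2_t a, float32x2_t b, float32x2_t c)
    {
      return vfma_lane_f32 (a, b, c, 1);
    }
]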
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vfmad_laneq_f64 (float64_t __a, float64_t __b,
-- float64x2_t __c, const int __lane)
+-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+-vfmas_lane_f32 (float32_t __a, float32_t __b,
+- float32x2_t __c, const int __lane)
+__extension__ extern __inline int32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_s32 (const int32_t * __a)
{
-- return __builtin_fma (__b, __aarch64_vget_lane_any (__c, __lane), __a);
+- return __builtin_fmaf (__b, __aarch64_vget_lane_any (__c, __lane), __a);
+ int32x4x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv4si ((const __builtin_aarch64_simd_si *) __a);
@@ -29266,33 +30969,28 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vfmas_laneq_f32 (float32_t __a, float32_t __b,
-- float32x4_t __c, const int __lane)
+-/* vfma_laneq */
+__extension__ extern __inline int64x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_s64 (const int64_t * __a)
- {
-- return __builtin_fmaf (__b, __aarch64_vget_lane_any (__c, __lane), __a);
++{
+ int64x2x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a);
+ ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
+ ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
+ return ret;
- }
++}
--/* vfmaq_lane */
--
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vfmaq_lane_f32 (float32x4_t __a, float32x4_t __b,
-- float32x2_t __c, const int __lane)
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vfma_laneq_f32 (float32x2_t __a, float32x2_t __b,
+- float32x4_t __c, const int __lane)
+__extension__ extern __inline uint8x16x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_u8 (const uint8_t * __a)
{
-- return __builtin_aarch64_fmav4sf (__b,
-- __aarch64_vdupq_lane_f32 (__c, __lane),
+- return __builtin_aarch64_fmav2sf (__b,
+- __aarch64_vdup_laneq_f32 (__c, __lane),
- __a);
+ uint8x16x2_t ret;
+ __builtin_aarch64_simd_oi __o;
@@ -29302,14 +31000,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vfmaq_lane_f64 (float64x2_t __a, float64x2_t __b,
-- float64x1_t __c, const int __lane)
+-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+-vfma_laneq_f64 (float64x1_t __a, float64x1_t __b,
+- float64x2_t __c, const int __lane)
+__extension__ extern __inline uint16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_u16 (const uint16_t * __a)
{
-- return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c[0]), __a);
+- float64_t __c0 = __aarch64_vget_lane_any (__c, __lane);
+- return (float64x1_t) {__builtin_fma (__b[0], __c0, __a[0])};
+ uint16x8x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv8hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -29318,18 +31017,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vfmaq_laneq */
--
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vfmaq_laneq_f32 (float32x4_t __a, float32x4_t __b,
-- float32x4_t __c, const int __lane)
+-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+-vfmad_laneq_f64 (float64_t __a, float64_t __b,
+- float64x2_t __c, const int __lane)
+__extension__ extern __inline uint32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_u32 (const uint32_t * __a)
{
-- return __builtin_aarch64_fmav4sf (__b,
-- __aarch64_vdupq_laneq_f32 (__c, __lane),
-- __a);
+- return __builtin_fma (__b, __aarch64_vget_lane_any (__c, __lane), __a);
+ uint32x4x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv4si ((const __builtin_aarch64_simd_si *) __a);
@@ -29338,16 +31033,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vfmaq_laneq_f64 (float64x2_t __a, float64x2_t __b,
-- float64x2_t __c, const int __lane)
+-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+-vfmas_laneq_f32 (float32_t __a, float32_t __b,
+- float32x4_t __c, const int __lane)
+__extension__ extern __inline uint64x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_u64 (const uint64_t * __a)
{
-- return __builtin_aarch64_fmav2df (__b,
-- __aarch64_vdupq_laneq_f64 (__c, __lane),
-- __a);
+- return __builtin_fmaf (__b, __aarch64_vget_lane_any (__c, __lane), __a);
+ uint64x2x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a);
@@ -29356,15 +31049,18 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vfms */
+-/* vfmaq_lane */
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vfms_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vfmaq_lane_f32 (float32x4_t __a, float32x4_t __b,
+- float32x2_t __c, const int __lane)
+__extension__ extern __inline float16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_f16 (const float16_t * __a)
{
-- return (float64x1_t) {__builtin_fma (-__b[0], __c[0], __a[0])};
+- return __builtin_aarch64_fmav4sf (__b,
+- __aarch64_vdupq_lane_f32 (__c, __lane),
+- __a);
+ float16x8x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv8hf ((const __builtin_aarch64_simd_hf *) __a);
@@ -29373,13 +31069,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vfms_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vfmaq_lane_f64 (float64x2_t __a, float64x2_t __b,
+- float64x1_t __c, const int __lane)
+__extension__ extern __inline float32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_f32 (const float32_t * __a)
{
-- return __builtin_aarch64_fmav2sf (-__b, __c, __a);
+- return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c[0]), __a);
+ float32x4x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv4sf ((const __builtin_aarch64_simd_sf *) __a);
@@ -29388,13 +31085,18 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
+-/* vfmaq_laneq */
+-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+-vfmaq_laneq_f32 (float32x4_t __a, float32x4_t __b,
+- float32x4_t __c, const int __lane)
+__extension__ extern __inline float64x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_f64 (const float64_t * __a)
{
-- return __builtin_aarch64_fmav4sf (-__b, __c, __a);
+- return __builtin_aarch64_fmav4sf (__b,
+- __aarch64_vdupq_laneq_f32 (__c, __lane),
+- __a);
+ float64x2x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv2df ((const __builtin_aarch64_simd_df *) __a);
@@ -29404,12 +31106,32 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
+-vfmaq_laneq_f64 (float64x2_t __a, float64x2_t __b,
+- float64x2_t __c, const int __lane)
++__extension__ extern __inline poly64x2x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vld2q_dup_p64 (const poly64_t * __a)
+ {
+- return __builtin_aarch64_fmav2df (__b,
+- __aarch64_vdupq_laneq_f64 (__c, __lane),
+- __a);
++ poly64x2x2_t ret;
++ __builtin_aarch64_simd_oi __o;
++ __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a);
++ ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 0);
++ ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 1);
++ return ret;
+ }
+
+-/* vfms */
+-
+-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+-vfms_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
+__extension__ extern __inline int64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_s64 (const int64_t * __a)
{
-- return __builtin_aarch64_fmav2df (-__b, __c, __a);
+- return (float64x1_t) {__builtin_fma (-__b[0], __c[0], __a[0])};
+ int64x1x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rdi ((const __builtin_aarch64_simd_di *) __a);
@@ -29419,19 +31141,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
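
[Note: the deleted vfms_* bodies negate the first multiplicand before the fused operation, e.g. __builtin_fma (-__b[0], __c[0], __a[0]), so the intrinsic computes a - b*c in a single rounding step. An equivalent caller-level sketch:

    #include <arm_neon.h>

    /* r[i] = a[i] - b[i] * c[i], fused (FMLS), one rounding.  */
    float32x2_t
    fused_sub (float32x2_t a, float32x2_t b, float32x2_t c)
    {
      return vfms_f32 (a, b, c);
    }
]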
--
--/* vfms_lane */
--
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vfms_lane_f32 (float32x2_t __a, float32x2_t __b,
-- float32x2_t __c, const int __lane)
+-vfms_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+__extension__ extern __inline uint64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_u64 (const uint64_t * __a)
{
-- return __builtin_aarch64_fmav2sf (-__b,
-- __aarch64_vdup_lane_f32 (__c, __lane),
-- __a);
+- return __builtin_aarch64_fmav2sf (-__b, __c, __a);
+ uint64x1x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rdi ((const __builtin_aarch64_simd_di *) __a);
@@ -29441,14 +31157,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vfms_lane_f64 (float64x1_t __a, float64x1_t __b,
-- float64x1_t __c, const int __lane)
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+__extension__ extern __inline float64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_f64 (const float64_t * __a)
{
-- return (float64x1_t) {__builtin_fma (-__b[0], __c[0], __a[0])};
+- return __builtin_aarch64_fmav4sf (-__b, __c, __a);
+ float64x1x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rdf ((const __builtin_aarch64_simd_df *) __a);
@@ -29458,14 +31173,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vfmsd_lane_f64 (float64_t __a, float64_t __b,
-- float64x1_t __c, const int __lane)
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
+__extension__ extern __inline int8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_s8 (const int8_t * __a)
{
-- return __builtin_fma (-__b, __c[0], __a);
+- return __builtin_aarch64_fmav2df (-__b, __c, __a);
+ int8x8x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv8qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -29475,15 +31189,20 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vfmss_lane_f32 (float32_t __a, float32_t __b,
-- float32x2_t __c, const int __lane)
+-
+-/* vfms_lane */
+-
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vfms_lane_f32 (float32x2_t __a, float32x2_t __b,
+- float32x2_t __c, const int __lane)
+__extension__ extern __inline poly8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_p8 (const poly8_t * __a)
{
-- return __builtin_fmaf (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
-+ poly8x8x3_t ret;
+- return __builtin_aarch64_fmav2sf (-__b,
+- __aarch64_vdup_lane_f32 (__c, __lane),
+- __a);
++ poly8x8x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv8qi ((const __builtin_aarch64_simd_qi *) __a);
+ ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
@@ -29492,18 +31211,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vfms_laneq */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vfms_laneq_f32 (float32x2_t __a, float32x2_t __b,
-- float32x4_t __c, const int __lane)
+-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+-vfms_lane_f64 (float64x1_t __a, float64x1_t __b,
+- float64x1_t __c, const int __lane)
+__extension__ extern __inline int16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_s16 (const int16_t * __a)
{
-- return __builtin_aarch64_fmav2sf (-__b,
-- __aarch64_vdup_laneq_f32 (__c, __lane),
-- __a);
+- return (float64x1_t) {__builtin_fma (-__b[0], __c[0], __a[0])};
+ int16x4x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv4hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -29513,15 +31228,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vfms_laneq_f64 (float64x1_t __a, float64x1_t __b,
-- float64x2_t __c, const int __lane)
+-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+-vfmsd_lane_f64 (float64_t __a, float64_t __b,
+- float64x1_t __c, const int __lane)
+__extension__ extern __inline poly16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_p16 (const poly16_t * __a)
{
-- float64_t __c0 = __aarch64_vget_lane_any (__c, __lane);
-- return (float64x1_t) {__builtin_fma (-__b[0], __c0, __a[0])};
+- return __builtin_fma (-__b, __c[0], __a);
+ poly16x4x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv4hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -29531,14 +31245,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vfmsd_laneq_f64 (float64_t __a, float64_t __b,
-- float64x2_t __c, const int __lane)
+-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+-vfmss_lane_f32 (float32_t __a, float32_t __b,
+- float32x2_t __c, const int __lane)
+__extension__ extern __inline int32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_s32 (const int32_t * __a)
{
-- return __builtin_fma (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
+- return __builtin_fmaf (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
+ int32x2x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv2si ((const __builtin_aarch64_simd_si *) __a);
@@ -29548,14 +31262,18 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vfmss_laneq_f32 (float32_t __a, float32_t __b,
-- float32x4_t __c, const int __lane)
+-/* vfms_laneq */
+-
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vfms_laneq_f32 (float32x2_t __a, float32x2_t __b,
+- float32x4_t __c, const int __lane)
+__extension__ extern __inline uint8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_u8 (const uint8_t * __a)
{
-- return __builtin_fmaf (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
+- return __builtin_aarch64_fmav2sf (-__b,
+- __aarch64_vdup_laneq_f32 (__c, __lane),
+- __a);
+ uint8x8x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv8qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -29565,18 +31283,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vfmsq_lane */
--
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vfmsq_lane_f32 (float32x4_t __a, float32x4_t __b,
-- float32x2_t __c, const int __lane)
+-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+-vfms_laneq_f64 (float64x1_t __a, float64x1_t __b,
+- float64x2_t __c, const int __lane)
+__extension__ extern __inline uint16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_u16 (const uint16_t * __a)
{
-- return __builtin_aarch64_fmav4sf (-__b,
-- __aarch64_vdupq_lane_f32 (__c, __lane),
-- __a);
+- float64_t __c0 = __aarch64_vget_lane_any (__c, __lane);
+- return (float64x1_t) {__builtin_fma (-__b[0], __c0, __a[0])};
+ uint16x4x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv4hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -29586,14 +31301,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vfmsq_lane_f64 (float64x2_t __a, float64x2_t __b,
-- float64x1_t __c, const int __lane)
+-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+-vfmsd_laneq_f64 (float64_t __a, float64_t __b,
+- float64x2_t __c, const int __lane)
+__extension__ extern __inline uint32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_u32 (const uint32_t * __a)
{
-- return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c[0]), __a);
+- return __builtin_fma (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
+ uint32x2x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv2si ((const __builtin_aarch64_simd_si *) __a);
@@ -29603,18 +31318,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vfmsq_laneq */
--
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vfmsq_laneq_f32 (float32x4_t __a, float32x4_t __b,
-- float32x4_t __c, const int __lane)
+-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+-vfmss_laneq_f32 (float32_t __a, float32_t __b,
+- float32x4_t __c, const int __lane)
+__extension__ extern __inline float16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_f16 (const float16_t * __a)
{
-- return __builtin_aarch64_fmav4sf (-__b,
-- __aarch64_vdupq_laneq_f32 (__c, __lane),
-- __a);
+- return __builtin_fmaf (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
+ float16x4x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv4hf ((const __builtin_aarch64_simd_hf *) __a);
@@ -29624,15 +31335,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vfmsq_laneq_f64 (float64x2_t __a, float64x2_t __b,
-- float64x2_t __c, const int __lane)
+-/* vfmsq_lane */
+-
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vfmsq_lane_f32 (float32x4_t __a, float32x4_t __b,
+- float32x2_t __c, const int __lane)
+__extension__ extern __inline float32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_f32 (const float32_t * __a)
{
-- return __builtin_aarch64_fmav2df (-__b,
-- __aarch64_vdupq_laneq_f64 (__c, __lane),
+- return __builtin_aarch64_fmav4sf (-__b,
+- __aarch64_vdupq_lane_f32 (__c, __lane),
- __a);
+ float32x2x3_t ret;
+ __builtin_aarch64_simd_ci __o;
@@ -29643,15 +31356,35 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vld1 */
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vfmsq_lane_f64 (float64x2_t __a, float64x2_t __b,
+- float64x1_t __c, const int __lane)
++__extension__ extern __inline poly64x1x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vld3_dup_p64 (const poly64_t * __a)
+ {
+- return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c[0]), __a);
++ poly64x1x3_t ret;
++ __builtin_aarch64_simd_ci __o;
++ __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a);
++ ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 0);
++ ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 1);
++ ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 2);
++ return ret;
+ }
+
+-/* vfmsq_laneq */
-
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
--vld1_f16 (const float16_t *__a)
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vfmsq_laneq_f32 (float32x4_t __a, float32x4_t __b,
+- float32x4_t __c, const int __lane)
+__extension__ extern __inline int8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_s8 (const int8_t * __a)
{
-- return __builtin_aarch64_ld1v4hf (__a);
+- return __builtin_aarch64_fmav4sf (-__b,
+- __aarch64_vdupq_laneq_f32 (__c, __lane),
+- __a);
+ int8x16x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv16qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -29661,13 +31394,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vld1_f32 (const float32_t *a)
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vfmsq_laneq_f64 (float64x2_t __a, float64x2_t __b,
+- float64x2_t __c, const int __lane)
+__extension__ extern __inline poly8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_p8 (const poly8_t * __a)
{
-- return __builtin_aarch64_ld1v2sf ((const __builtin_aarch64_simd_sf *) a);
+- return __builtin_aarch64_fmav2df (-__b,
+- __aarch64_vdupq_laneq_f64 (__c, __lane),
+- __a);
+ poly8x16x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv16qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -29677,13 +31413,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vld1_f64 (const float64_t *a)
+-/* vld1 */
+__extension__ extern __inline int16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_s16 (const int16_t * __a)
- {
-- return (float64x1_t) {*a};
++{
+ int16x8x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv8hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -29691,16 +31425,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
+ ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
+ return ret;
- }
++}
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vld1_p8 (const poly8_t *a)
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+-vld1_f16 (const float16_t *__a)
+__extension__ extern __inline poly16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_p16 (const poly16_t * __a)
{
-- return (poly8x8_t)
-- __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
+- return __builtin_aarch64_ld1v4hf (__a);
+ poly16x8x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv8hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -29710,14 +31443,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vld1_p16 (const poly16_t *a)
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vld1_f32 (const float32_t *a)
+__extension__ extern __inline int32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_s32 (const int32_t * __a)
{
-- return (poly16x4_t)
-- __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
+- return __builtin_aarch64_ld1v2sf ((const __builtin_aarch64_simd_sf *) a);
+ int32x4x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv4si ((const __builtin_aarch64_simd_si *) __a);
@@ -29727,13 +31459,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vld1_s8 (const int8_t *a)
+-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+-vld1_f64 (const float64_t *a)
+__extension__ extern __inline int64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_s64 (const int64_t * __a)
{
-- return __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
+- return (float64x1_t) {*a};
+ int64x2x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a);
@@ -29743,13 +31475,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vld1_s16 (const int16_t *a)
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+-vld1_p8 (const poly8_t *a)
+__extension__ extern __inline uint8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_u8 (const uint8_t * __a)
{
-- return __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
+- return (poly8x8_t)
+- __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
+ uint8x16x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv16qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -29759,13 +31492,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vld1_s32 (const int32_t *a)
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+-vld1_p16 (const poly16_t *a)
+__extension__ extern __inline uint16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_u16 (const uint16_t * __a)
{
-- return __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a);
+- return (poly16x4_t)
+- __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
+ uint16x8x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv8hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -29775,13 +31509,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vld1_s64 (const int64_t *a)
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vld1_s8 (const int8_t *a)
+__extension__ extern __inline uint32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_u32 (const uint32_t * __a)
{
-- return (int64x1_t) {*a};
+- return __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
+ uint32x4x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv4si ((const __builtin_aarch64_simd_si *) __a);
@@ -29791,14 +31525,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vld1_u8 (const uint8_t *a)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vld1_s16 (const int16_t *a)
+__extension__ extern __inline uint64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_u64 (const uint64_t * __a)
{
-- return (uint8x8_t)
-- __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
+- return __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
+ uint64x2x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a);
@@ -29808,14 +31541,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vld1_u16 (const uint16_t *a)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vld1_s32 (const int32_t *a)
+__extension__ extern __inline float16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_f16 (const float16_t * __a)
{
-- return (uint16x4_t)
-- __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
+- return __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a);
+ float16x8x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv8hf ((const __builtin_aarch64_simd_hf *) __a);
@@ -29825,14 +31557,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vld1_u32 (const uint32_t *a)
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+-vld1_s64 (const int64_t *a)
+__extension__ extern __inline float32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_f32 (const float32_t * __a)
{
-- return (uint32x2_t)
-- __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a);
+- return (int64x1_t) {*a};
+ float32x4x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv4sf ((const __builtin_aarch64_simd_sf *) __a);
@@ -29842,13 +31573,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vld1_u64 (const uint64_t *a)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vld1_u8 (const uint8_t *a)
+__extension__ extern __inline float64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_f64 (const float64_t * __a)
{
-- return (uint64x1_t) {*a};
+- return (uint8x8_t)
+- __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
+ float64x2x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv2df ((const __builtin_aarch64_simd_df *) __a);
@@ -29858,15 +31590,31 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vld1q */
--
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
--vld1q_f16 (const float16_t *__a)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vld1_u16 (const uint16_t *a)
++__extension__ extern __inline poly64x2x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vld3q_dup_p64 (const poly64_t * __a)
+ {
+- return (uint16x4_t)
+- __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
++ poly64x2x3_t ret;
++ __builtin_aarch64_simd_ci __o;
++ __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a);
++ ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 0);
++ ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 1);
++ ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 2);
++ return ret;
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vld1_u32 (const uint32_t *a)
+__extension__ extern __inline int64x1x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_s64 (const int64_t * __a)
{
-- return __builtin_aarch64_ld1v8hf (__a);
+- return (uint32x2_t)
+- __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a);
+ int64x1x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rdi ((const __builtin_aarch64_simd_di *) __a);
@@ -29877,13 +31625,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vld1q_f32 (const float32_t *a)
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+-vld1_u64 (const uint64_t *a)
+__extension__ extern __inline uint64x1x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_u64 (const uint64_t * __a)
{
-- return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) a);
+- return (uint64x1_t) {*a};
+ uint64x1x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rdi ((const __builtin_aarch64_simd_di *) __a);
@@ -29894,13 +31642,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vld1q_f64 (const float64_t *a)
+-/* vld1q */
+__extension__ extern __inline float64x1x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_f64 (const float64_t * __a)
- {
-- return __builtin_aarch64_ld1v2df ((const __builtin_aarch64_simd_df *) a);
++{
+ float64x1x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rdf ((const __builtin_aarch64_simd_df *) __a);
@@ -29909,16 +31655,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 2)};
+ ret.val[3] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 3)};
+ return ret;
- }
++}
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vld1q_p8 (const poly8_t *a)
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+-vld1q_f16 (const float16_t *__a)
+__extension__ extern __inline int8x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_s8 (const int8_t * __a)
{
-- return (poly8x16_t)
-- __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
+- return __builtin_aarch64_ld1v8hf (__a);
+ int8x8x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv8qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -29929,14 +31674,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vld1q_p16 (const poly16_t *a)
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vld1q_f32 (const float32_t *a)
+__extension__ extern __inline poly8x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_p8 (const poly8_t * __a)
{
-- return (poly16x8_t)
-- __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
+- return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) a);
+ poly8x8x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv8qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -29947,13 +31691,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vld1q_s8 (const int8_t *a)
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vld1q_f64 (const float64_t *a)
+__extension__ extern __inline int16x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_s16 (const int16_t * __a)
{
-- return __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
+- return __builtin_aarch64_ld1v2df ((const __builtin_aarch64_simd_df *) a);
+ int16x4x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -29964,13 +31708,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vld1q_s16 (const int16_t *a)
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+-vld1q_p8 (const poly8_t *a)
+__extension__ extern __inline poly16x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_p16 (const poly16_t * __a)
{
-- return __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
+- return (poly8x16_t)
+- __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
+ poly16x4x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -29981,13 +31726,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vld1q_s32 (const int32_t *a)
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+-vld1q_p16 (const poly16_t *a)
+__extension__ extern __inline int32x2x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_s32 (const int32_t * __a)
{
-- return __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a);
+- return (poly16x8_t)
+- __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
+ int32x2x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv2si ((const __builtin_aarch64_simd_si *) __a);
@@ -29998,13 +31744,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vld1q_s64 (const int64_t *a)
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+-vld1q_s8 (const int8_t *a)
+__extension__ extern __inline uint8x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_u8 (const uint8_t * __a)
{
-- return __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
+- return __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
+ uint8x8x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv8qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -30015,14 +31761,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vld1q_u8 (const uint8_t *a)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vld1q_s16 (const int16_t *a)
+__extension__ extern __inline uint16x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_u16 (const uint16_t * __a)
{
-- return (uint8x16_t)
-- __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
+- return __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
+ uint16x4x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -30033,14 +31778,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vld1q_u16 (const uint16_t *a)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vld1q_s32 (const int32_t *a)
+__extension__ extern __inline uint32x2x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_u32 (const uint32_t * __a)
{
-- return (uint16x8_t)
-- __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
+- return __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a);
+ uint32x2x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv2si ((const __builtin_aarch64_simd_si *) __a);
@@ -30051,14 +31795,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vld1q_u32 (const uint32_t *a)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vld1q_s64 (const int64_t *a)
+__extension__ extern __inline float16x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_f16 (const float16_t * __a)
{
-- return (uint32x4_t)
-- __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a);
+- return __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
+ float16x4x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv4hf ((const __builtin_aarch64_simd_hf *) __a);
@@ -30069,14 +31812,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vld1q_u64 (const uint64_t *a)
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vld1q_u8 (const uint8_t *a)
+__extension__ extern __inline float32x2x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_f32 (const float32_t * __a)
{
-- return (uint64x2_t)
-- __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
+- return (uint8x16_t)
+- __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
+ float32x2x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv2sf ((const __builtin_aarch64_simd_sf *) __a);
@@ -30087,16 +31830,32 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vld1_dup */
--
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
--vld1_dup_f16 (const float16_t* __a)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vld1q_u16 (const uint16_t *a)
++__extension__ extern __inline poly64x1x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vld4_dup_p64 (const poly64_t * __a)
+ {
+- return (uint16x8_t)
+- __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
++ poly64x1x4_t ret;
++ __builtin_aarch64_simd_xi __o;
++ __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a);
++ ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 0);
++ ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 1);
++ ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 2);
++ ret.val[3] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 3);
++ return ret;
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vld1q_u32 (const uint32_t *a)
+__extension__ extern __inline int8x16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_s8 (const int8_t * __a)
{
-- float16_t __f = *__a;
-- return (float16x4_t) { __f, __f, __f, __f };
+- return (uint32x4_t)
+- __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a);
+ int8x16x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv16qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -30107,13 +31866,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vld1_dup_f32 (const float32_t* __a)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vld1q_u64 (const uint64_t *a)
+__extension__ extern __inline poly8x16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_p8 (const poly8_t * __a)
{
-- return vdup_n_f32 (*__a);
+- return (uint64x2_t)
+- __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
+ poly8x16x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv16qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -30124,13 +31884,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vld1_dup_f64 (const float64_t* __a)
+-/* vld1_dup */
+-
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+-vld1_dup_f16 (const float16_t* __a)
+__extension__ extern __inline int16x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_s16 (const int16_t * __a)
{
-- return vdup_n_f64 (*__a);
+- float16_t __f = *__a;
+- return (float16x4_t) { __f, __f, __f, __f };
+ int16x8x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv8hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -30141,13 +31904,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vld1_dup_p8 (const poly8_t* __a)
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vld1_dup_f32 (const float32_t* __a)
+__extension__ extern __inline poly16x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_p16 (const poly16_t * __a)
{
-- return vdup_n_p8 (*__a);
+- return vdup_n_f32 (*__a);
+ poly16x8x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv8hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -30158,13 +31921,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vld1_dup_p16 (const poly16_t* __a)
+-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+-vld1_dup_f64 (const float64_t* __a)
+__extension__ extern __inline int32x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_s32 (const int32_t * __a)
{
-- return vdup_n_p16 (*__a);
+- return vdup_n_f64 (*__a);
+ int32x4x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv4si ((const __builtin_aarch64_simd_si *) __a);
@@ -30175,13 +31938,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vld1_dup_s8 (const int8_t* __a)
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+-vld1_dup_p8 (const poly8_t* __a)
+__extension__ extern __inline int64x2x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_s64 (const int64_t * __a)
{
-- return vdup_n_s8 (*__a);
+- return vdup_n_p8 (*__a);
+ int64x2x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a);
@@ -30192,13 +31955,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vld1_dup_s16 (const int16_t* __a)
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+-vld1_dup_p16 (const poly16_t* __a)
+__extension__ extern __inline uint8x16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_u8 (const uint8_t * __a)
{
-- return vdup_n_s16 (*__a);
+- return vdup_n_p16 (*__a);
+ uint8x16x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv16qi ((const __builtin_aarch64_simd_qi *) __a);
@@ -30209,13 +31972,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vld1_dup_s32 (const int32_t* __a)
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vld1_dup_s8 (const int8_t* __a)
+__extension__ extern __inline uint16x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_u16 (const uint16_t * __a)
{
-- return vdup_n_s32 (*__a);
+- return vdup_n_s8 (*__a);
+ uint16x8x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv8hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -30226,13 +31989,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vld1_dup_s64 (const int64_t* __a)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vld1_dup_s16 (const int16_t* __a)
+__extension__ extern __inline uint32x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_u32 (const uint32_t * __a)
{
-- return vdup_n_s64 (*__a);
+- return vdup_n_s16 (*__a);
+ uint32x4x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv4si ((const __builtin_aarch64_simd_si *) __a);
@@ -30243,13 +32006,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vld1_dup_u8 (const uint8_t* __a)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vld1_dup_s32 (const int32_t* __a)
+__extension__ extern __inline uint64x2x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_u64 (const uint64_t * __a)
{
-- return vdup_n_u8 (*__a);
+- return vdup_n_s32 (*__a);
+ uint64x2x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a);
@@ -30260,13 +32023,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vld1_dup_u16 (const uint16_t* __a)
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+-vld1_dup_s64 (const int64_t* __a)
+__extension__ extern __inline float16x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_f16 (const float16_t * __a)
{
-- return vdup_n_u16 (*__a);
+- return vdup_n_s64 (*__a);
+ float16x8x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv8hf ((const __builtin_aarch64_simd_hf *) __a);
@@ -30277,13 +32040,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vld1_dup_u32 (const uint32_t* __a)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vld1_dup_u8 (const uint8_t* __a)
+__extension__ extern __inline float32x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_f32 (const float32_t * __a)
{
-- return vdup_n_u32 (*__a);
+- return vdup_n_u8 (*__a);
+ float32x4x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv4sf ((const __builtin_aarch64_simd_sf *) __a);
@@ -30294,13 +32057,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vld1_dup_u64 (const uint64_t* __a)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vld1_dup_u16 (const uint16_t* __a)
+__extension__ extern __inline float64x2x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_f64 (const float64_t * __a)
{
-- return vdup_n_u64 (*__a);
+- return vdup_n_u16 (*__a);
+ float64x2x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv2df ((const __builtin_aarch64_simd_df *) __a);
@@ -30311,14 +32074,29 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret;
}
--/* vld1q_dup */
-+/* vld2_lane */
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vld1_dup_u32 (const uint32_t* __a)
++__extension__ extern __inline poly64x2x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vld4q_dup_p64 (const poly64_t * __a)
+ {
+- return vdup_n_u32 (*__a);
++ poly64x2x4_t ret;
++ __builtin_aarch64_simd_xi __o;
++ __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a);
++ ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 0);
++ ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 1);
++ ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 2);
++ ret.val[3] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 3);
++ return ret;
+ }
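
The vld4_dup/vld4q_dup rewrites above keep the LD4R semantics unchanged — four consecutive elements are loaded and element n is replicated across every lane of result vector n; only the inlining attributes (extern __inline plus __gnu_inline/__artificial__) change. A minimal caller-side sketch, assuming an AArch64 target and the stock <arm_neon.h> (function and variable names here are illustrative only):

    #include <arm_neon.h>

    /* Broadcast four filter taps: out.val[0] = {c0,c0,c0,c0},
       out.val[1] = {c1,c1,c1,c1}, and so on.  */
    float32x4x4_t
    broadcast_taps (const float *coeffs)
    {
      return vld4q_dup_f32 (coeffs);
    }
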
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
--vld1q_dup_f16 (const float16_t* __a)
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+-vld1_dup_u64 (const uint64_t* __a)
-{
-- float16_t __f = *__a;
-- return (float16x8_t) { __f, __f, __f, __f, __f, __f, __f, __f };
+- return vdup_n_u64 (*__a);
++/* vld2_lane */
++
+#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \
+ qmode, ptrmode, funcsuffix, signedtype) \
+__extension__ extern __inline intype \
@@ -30344,11 +32122,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return __b; \
}
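
The __LD2_LANE_FUNC template above replaces the old per-type vld2_lane_* definitions with one macro expansion per element type; as the signedtype/qmode parameters suggest, the D-register inputs are widened to Q registers for the ld2_lane builtin and narrowed back afterwards. The caller-visible ACLE signature is unchanged; a sketch of typical use (hypothetical data):

    #include <arm_neon.h>

    /* Refill lane 1 of an already de-interleaved pair:
       v.val[0] lane 1 <- ptr[0], v.val[1] lane 1 <- ptr[1].  */
    int16x4x2_t
    load_lane1 (const int16_t *ptr, int16x4x2_t v)
    {
      return vld2_lane_s16 (ptr, v, 1);
    }
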
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vld1q_dup_f32 (const float32_t* __a)
--{
-- return vdupq_n_f32 (*__a);
--}
+-/* vld1q_dup */
+__LD2_LANE_FUNC (float16x4x2_t, float16x4_t, float16x8x2_t, float16_t, v4hf,
+ v8hf, hf, f16, float16x8_t)
+__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v2sf, v4sf,
@@ -30359,6 +32133,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ int8x16_t)
+__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi,
+ p16, int16x8_t)
++__LD2_LANE_FUNC (poly64x1x2_t, poly64x1_t, poly64x2x2_t, poly64_t, di,
++ v2di_ssps, di, p64, poly64x2_t)
+__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8,
+ int8x16_t)
+__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16,
@@ -30376,17 +32152,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, di, v2di, di,
+ u64, int64x2_t)
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vld1q_dup_f64 (const float64_t* __a)
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+-vld1q_dup_f16 (const float16_t* __a)
-{
-- return vdupq_n_f64 (*__a);
--}
+- float16_t __f = *__a;
+- return (float16x8_t) { __f, __f, __f, __f, __f, __f, __f, __f };
+#undef __LD2_LANE_FUNC
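
The ++ additions in the instantiation lists here and in the LD3/LD4 blocks below are the new poly64 variants (poly64x1 / poly64x2) for the lane and dup families. Assuming a target where the poly64 types are enabled (in ACLE they come with the crypto extension), they behave like every other element type:

    #include <arm_neon.h>

    /* Build with something like -march=armv8-a+crypto (assumption:
       the poly64 intrinsics need the crypto extension enabled).  */
    poly64x1x2_t
    refill_p64 (const poly64_t *ptr, poly64x1x2_t v)
    {
      return vld2_lane_p64 (ptr, v, 0);  /* lane must be 0 for 64x1 */
    }
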
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vld1q_dup_p8 (const poly8_t* __a)
--{
-- return vdupq_n_p8 (*__a);
++
+/* vld2q_lane */
+
+#define __LD2_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
@@ -30405,16 +32177,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret; \
}
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vld1q_dup_p16 (const poly16_t* __a)
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vld1q_dup_f32 (const float32_t* __a)
-{
-- return vdupq_n_p16 (*__a);
--}
+- return vdupq_n_f32 (*__a);
+__LD2_LANE_FUNC (float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16)
+__LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD2_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64)
+__LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD2_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16)
++__LD2_LANE_FUNC (poly64x2x2_t, poly64x2_t, poly64_t, v2di, di, p64)
+__LD2_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD2_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD2_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32)
@@ -30423,25 +32195,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+__LD2_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD2_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD2_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64)
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vld1q_dup_s8 (const int8_t* __a)
--{
-- return vdupq_n_s8 (*__a);
--}
++
+#undef __LD2_LANE_FUNC
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vld1q_dup_s16 (const int16_t* __a)
--{
-- return vdupq_n_s16 (*__a);
--}
++
+/* vld3_lane */
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vld1q_dup_s32 (const int32_t* __a)
--{
-- return vdupq_n_s32 (*__a);
++
+#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \
+ qmode, ptrmode, funcsuffix, signedtype) \
+__extension__ extern __inline intype \
@@ -30473,10 +32231,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return __b; \
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vld1q_dup_s64 (const int64_t* __a)
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vld1q_dup_f64 (const float64_t* __a)
-{
-- return vdupq_n_s64 (*__a);
+- return vdupq_n_f64 (*__a);
-}
+__LD3_LANE_FUNC (float16x4x3_t, float16x4_t, float16x8x3_t, float16_t, v4hf,
+ v8hf, hf, f16, float16x8_t)
@@ -30488,6 +32246,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ int8x16_t)
+__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi,
+ p16, int16x8_t)
++__LD3_LANE_FUNC (poly64x1x3_t, poly64x1_t, poly64x2x3_t, poly64_t, di,
++ v2di_ssps, di, p64, poly64x2_t)
+__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8,
+ int8x16_t)
+__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16,
@@ -30505,24 +32265,24 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, di, v2di, di,
+ u64, int64x2_t)
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vld1q_dup_u8 (const uint8_t* __a)
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+-vld1q_dup_p8 (const poly8_t* __a)
-{
-- return vdupq_n_u8 (*__a);
+- return vdupq_n_p8 (*__a);
-}
+#undef __LD3_LANE_FUNC
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vld1q_dup_u16 (const uint16_t* __a)
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+-vld1q_dup_p16 (const poly16_t* __a)
-{
-- return vdupq_n_u16 (*__a);
+- return vdupq_n_p16 (*__a);
-}
+/* vld3q_lane */
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vld1q_dup_u32 (const uint32_t* __a)
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+-vld1q_dup_s8 (const int8_t* __a)
-{
-- return vdupq_n_u32 (*__a);
+- return vdupq_n_s8 (*__a);
+#define __LD3_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ extern __inline intype \
+__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
@@ -30541,16 +32301,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret; \
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vld1q_dup_u64 (const uint64_t* __a)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vld1q_dup_s16 (const int16_t* __a)
-{
-- return vdupq_n_u64 (*__a);
+- return vdupq_n_s16 (*__a);
-}
+__LD3_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16)
+__LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD3_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64)
+__LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD3_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16)
++__LD3_LANE_FUNC (poly64x2x3_t, poly64x2_t, poly64_t, v2di, di, p64)
+__LD3_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD3_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD3_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32)
@@ -30560,20 +32321,24 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+__LD3_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD3_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64)
--/* vld1_lane */
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vld1q_dup_s32 (const int32_t* __a)
+-{
+- return vdupq_n_s32 (*__a);
+-}
+#undef __LD3_LANE_FUNC
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
--vld1_lane_f16 (const float16_t *__src, float16x4_t __vec, const int __lane)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vld1q_dup_s64 (const int64_t* __a)
-{
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
+- return vdupq_n_s64 (*__a);
-}
+/* vld4_lane */
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vld1_lane_f32 (const float32_t *__src, float32x2_t __vec, const int __lane)
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vld1q_dup_u8 (const uint8_t* __a)
-{
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
+- return vdupq_n_u8 (*__a);
+#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \
+ qmode, ptrmode, funcsuffix, signedtype) \
+__extension__ extern __inline intype \
@@ -30611,17 +32376,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return __b; \
}
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vld1_lane_f64 (const float64_t *__src, float64x1_t __vec, const int __lane)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vld1q_dup_u16 (const uint16_t* __a)
-{
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
+- return vdupq_n_u16 (*__a);
-}
+/* vld4q_lane */
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vld1_lane_p8 (const poly8_t *__src, poly8x8_t __vec, const int __lane)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vld1q_dup_u32 (const uint32_t* __a)
-{
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
+- return vdupq_n_u32 (*__a);
-}
+__LD4_LANE_FUNC (float16x4x4_t, float16x4_t, float16x8x4_t, float16_t, v4hf,
+ v8hf, hf, f16, float16x8_t)
@@ -30633,6 +32398,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ int8x16_t)
+__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi,
+ p16, int16x8_t)
++__LD4_LANE_FUNC (poly64x1x4_t, poly64x1_t, poly64x2x4_t, poly64_t, di,
++ v2di_ssps, di, p64, poly64x2_t)
+__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8,
+ int8x16_t)
+__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16,
@@ -30650,22 +32417,18 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, di, v2di, di,
+ u64, int64x2_t)
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vld1_lane_p16 (const poly16_t *__src, poly16x4_t __vec, const int __lane)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vld1q_dup_u64 (const uint64_t* __a)
-{
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
+- return vdupq_n_u64 (*__a);
-}
+#undef __LD4_LANE_FUNC
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vld1_lane_s8 (const int8_t *__src, int8x8_t __vec, const int __lane)
--{
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
--}
+-/* vld1_lane */
+/* vld4q_lane */
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vld1_lane_s16 (const int16_t *__src, int16x4_t __vec, const int __lane)
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+-vld1_lane_f16 (const float16_t *__src, float16x4_t __vec, const int __lane)
-{
- return __aarch64_vset_lane_any (*__src, __vec, __lane);
+#define __LD4_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
@@ -30688,8 +32451,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return ret; \
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vld1_lane_s32 (const int32_t *__src, int32x2_t __vec, const int __lane)
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vld1_lane_f32 (const float32_t *__src, float32x2_t __vec, const int __lane)
-{
- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-}
@@ -30698,6 +32461,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+__LD4_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64)
+__LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD4_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16)
++__LD4_LANE_FUNC (poly64x2x4_t, poly64x2_t, poly64_t, v2di, di, p64)
+__LD4_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD4_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD4_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32)
@@ -30707,22 +32471,22 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+__LD4_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD4_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64)
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vld1_lane_s64 (const int64_t *__src, int64x1_t __vec, const int __lane)
+-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+-vld1_lane_f64 (const float64_t *__src, float64x1_t __vec, const int __lane)
-{
- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-}
+#undef __LD4_LANE_FUNC
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vld1_lane_u8 (const uint8_t *__src, uint8x8_t __vec, const int __lane)
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+-vld1_lane_p8 (const poly8_t *__src, poly8x8_t __vec, const int __lane)
-{
- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-}
+/* vmax */
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vld1_lane_u16 (const uint16_t *__src, uint16x4_t __vec, const int __lane)
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+-vld1_lane_p16 (const poly16_t *__src, poly16x4_t __vec, const int __lane)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmax_f32 (float32x2_t __a, float32x2_t __b)
@@ -30731,8 +32495,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return __builtin_aarch64_smax_nanv2sf (__a, __b);
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vld1_lane_u32 (const uint32_t *__src, uint32x2_t __vec, const int __lane)
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vld1_lane_s8 (const int8_t *__src, int8x8_t __vec, const int __lane)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmax_f64 (float64x1_t __a, float64x1_t __b)
@@ -30743,8 +32507,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ vget_lane_f64 (__b, 0)) };
}
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vld1_lane_u64 (const uint64_t *__src, uint64x1_t __vec, const int __lane)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vld1_lane_s16 (const int16_t *__src, int16x4_t __vec, const int __lane)
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmax_s8 (int8x8_t __a, int8x8_t __b)
@@ -30753,10 +32517,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return __builtin_aarch64_smaxv8qi (__a, __b);
}
--/* vld1q_lane */
--
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
--vld1q_lane_f16 (const float16_t *__src, float16x8_t __vec, const int __lane)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vld1_lane_s32 (const int32_t *__src, int32x2_t __vec, const int __lane)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmax_s16 (int16x4_t __a, int16x4_t __b)
@@ -30765,8 +32527,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return __builtin_aarch64_smaxv4hi (__a, __b);
}
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vld1q_lane_f32 (const float32_t *__src, float32x4_t __vec, const int __lane)
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+-vld1_lane_s64 (const int64_t *__src, int64x1_t __vec, const int __lane)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmax_s32 (int32x2_t __a, int32x2_t __b)
@@ -30775,8 +32537,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return __builtin_aarch64_smaxv2si (__a, __b);
}
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vld1q_lane_f64 (const float64_t *__src, float64x2_t __vec, const int __lane)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vld1_lane_u8 (const uint8_t *__src, uint8x8_t __vec, const int __lane)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmax_u8 (uint8x8_t __a, uint8x8_t __b)
@@ -30786,8 +32548,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ (int8x8_t) __b);
}
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vld1q_lane_p8 (const poly8_t *__src, poly8x16_t __vec, const int __lane)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vld1_lane_u16 (const uint16_t *__src, uint16x4_t __vec, const int __lane)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmax_u16 (uint16x4_t __a, uint16x4_t __b)
@@ -30797,8 +32559,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ (int16x4_t) __b);
}
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vld1q_lane_p16 (const poly16_t *__src, poly16x8_t __vec, const int __lane)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vld1_lane_u32 (const uint32_t *__src, uint32x2_t __vec, const int __lane)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmax_u32 (uint32x2_t __a, uint32x2_t __b)
@@ -30808,8 +32570,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ (int32x2_t) __b);
}
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vld1q_lane_s8 (const int8_t *__src, int8x16_t __vec, const int __lane)
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+-vld1_lane_u64 (const uint64_t *__src, uint64x1_t __vec, const int __lane)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxq_f32 (float32x4_t __a, float32x4_t __b)
@@ -30818,8 +32580,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return __builtin_aarch64_smax_nanv4sf (__a, __b);
}
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vld1q_lane_s16 (const int16_t *__src, int16x8_t __vec, const int __lane)
+-/* vld1q_lane */
+-
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+-vld1q_lane_f16 (const float16_t *__src, float16x8_t __vec, const int __lane)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxq_f64 (float64x2_t __a, float64x2_t __b)
@@ -30828,8 +32592,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return __builtin_aarch64_smax_nanv2df (__a, __b);
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vld1q_lane_s32 (const int32_t *__src, int32x4_t __vec, const int __lane)
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vld1q_lane_f32 (const float32_t *__src, float32x4_t __vec, const int __lane)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxq_s8 (int8x16_t __a, int8x16_t __b)
@@ -30838,8 +32602,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return __builtin_aarch64_smaxv16qi (__a, __b);
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vld1q_lane_s64 (const int64_t *__src, int64x2_t __vec, const int __lane)
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vld1q_lane_f64 (const float64_t *__src, float64x2_t __vec, const int __lane)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxq_s16 (int16x8_t __a, int16x8_t __b)
@@ -30848,8 +32612,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return __builtin_aarch64_smaxv8hi (__a, __b);
}
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vld1q_lane_u8 (const uint8_t *__src, uint8x16_t __vec, const int __lane)
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+-vld1q_lane_p8 (const poly8_t *__src, poly8x16_t __vec, const int __lane)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxq_s32 (int32x4_t __a, int32x4_t __b)
@@ -30858,8 +32622,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return __builtin_aarch64_smaxv4si (__a, __b);
}
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vld1q_lane_u16 (const uint16_t *__src, uint16x8_t __vec, const int __lane)
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+-vld1q_lane_p16 (const poly16_t *__src, poly16x8_t __vec, const int __lane)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxq_u8 (uint8x16_t __a, uint8x16_t __b)
@@ -30869,8 +32633,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ (int8x16_t) __b);
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vld1q_lane_u32 (const uint32_t *__src, uint32x4_t __vec, const int __lane)
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+-vld1q_lane_s8 (const int8_t *__src, int8x16_t __vec, const int __lane)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxq_u16 (uint16x8_t __a, uint16x8_t __b)
@@ -30880,8 +32644,8 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ (int16x8_t) __b);
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vld1q_lane_u64 (const uint64_t *__src, uint64x2_t __vec, const int __lane)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vld1q_lane_s16 (const int16_t *__src, int16x8_t __vec, const int __lane)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxq_u32 (uint32x4_t __a, uint32x4_t __b)
@@ -30892,13 +32656,73 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
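
From here the patch turns to the vmax family, again changing only the inlining attributes while keeping the existing builtins. For the float variants that builtin is smax_nan, i.e. FMAX behaviour: a NaN in either input propagates to the result (vmaxnm/FMAXNM is the IEEE maxNum counterpart). A usage sketch:

    #include <arm_neon.h>

    /* Lane-wise lower clamp; NaN lanes in x stay NaN (FMAX).  */
    float32x2_t
    clamp_low (float32x2_t x, float32x2_t lo)
    {
      return vmax_f32 (x, lo);
    }
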
+/* vmulx */
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vld1q_lane_s32 (const int32_t *__src, int32x4_t __vec, const int __lane)
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmulx_f32 (float32x2_t __a, float32x2_t __b)
+ {
+- return __aarch64_vset_lane_any (*__src, __vec, __lane);
++ return __builtin_aarch64_fmulxv2sf (__a, __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vld1q_lane_s64 (const int64_t *__src, int64x2_t __vec, const int __lane)
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmulxq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+- return __aarch64_vset_lane_any (*__src, __vec, __lane);
++ return __builtin_aarch64_fmulxv4sf (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vld1q_lane_u8 (const uint8_t *__src, uint8x16_t __vec, const int __lane)
++__extension__ extern __inline float64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmulx_f64 (float64x1_t __a, float64x1_t __b)
+ {
+- return __aarch64_vset_lane_any (*__src, __vec, __lane);
++ return (float64x1_t) {__builtin_aarch64_fmulxdf (__a[0], __b[0])};
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vld1q_lane_u16 (const uint16_t *__src, uint16x8_t __vec, const int __lane)
++__extension__ extern __inline float64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmulxq_f64 (float64x2_t __a, float64x2_t __b)
+ {
+- return __aarch64_vset_lane_any (*__src, __vec, __lane);
++ return __builtin_aarch64_fmulxv2df (__a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vld1q_lane_u32 (const uint32_t *__src, uint32x4_t __vec, const int __lane)
++__extension__ extern __inline float32_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmulxs_f32 (float32_t __a, float32_t __b)
+ {
+- return __aarch64_vset_lane_any (*__src, __vec, __lane);
++ return __builtin_aarch64_fmulxsf (__a, __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vld1q_lane_u64 (const uint64_t *__src, uint64x2_t __vec, const int __lane)
++__extension__ extern __inline float64_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmulxd_f64 (float64_t __a, float64_t __b)
+ {
+- return __aarch64_vset_lane_any (*__src, __vec, __lane);
++ return __builtin_aarch64_fmulxdf (__a, __b);
+ }
+
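
The vmulx intrinsics introduced here map to FMULX: an ordinary multiply, except that (±0) × (±∞) yields ±2.0 instead of the NaN an FMUL would produce — the special case that makes it safe inside vrecpe/vrsqrte-style Newton-Raphson refinement. Minimal sketch:

    #include <arm_neon.h>

    /* With a = {0.0f, 1.0f} and b = {INFINITY, 2.0f}, the result is
       {2.0f, 2.0f}; vmul_f32 would give {NaN, 2.0f}.  */
    float32x2_t
    mulx_demo (float32x2_t a, float32x2_t b)
    {
      return vmulx_f32 (a, b);
    }
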
-/* vldn */
-
-__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
-vld2_s64 (const int64_t * __a)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulx_f32 (float32x2_t __a, float32x2_t __b)
++vmulx_lane_f32 (float32x2_t __a, float32x2_t __v, const int __lane)
{
- int64x1x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -30906,14 +32730,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 0);
- ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 1);
- return ret;
-+ return __builtin_aarch64_fmulxv2sf (__a, __b);
++ return vmulx_f32 (__a, __aarch64_vdup_lane_f32 (__v, __lane));
}
-__extension__ static __inline uint64x1x2_t __attribute__ ((__always_inline__))
-vld2_u64 (const uint64_t * __a)
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxq_f32 (float32x4_t __a, float32x4_t __b)
++vmulx_lane_f64 (float64x1_t __a, float64x1_t __v, const int __lane)
{
- uint64x1x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -30921,14 +32745,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 0);
- ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 1);
- return ret;
-+ return __builtin_aarch64_fmulxv4sf (__a, __b);
++ return vmulx_f64 (__a, __aarch64_vdup_lane_f64 (__v, __lane));
}
-__extension__ static __inline float64x1x2_t __attribute__ ((__always_inline__))
-vld2_f64 (const float64_t * __a)
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulx_f64 (float64x1_t __a, float64x1_t __b)
++vmulxq_lane_f32 (float32x4_t __a, float32x2_t __v, const int __lane)
{
- float64x1x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -30936,14 +32760,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 0)};
- ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 1)};
- return ret;
-+ return (float64x1_t) {__builtin_aarch64_fmulxdf (__a[0], __b[0])};
++ return vmulxq_f32 (__a, __aarch64_vdupq_lane_f32 (__v, __lane));
}
-__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
-vld2_s8 (const int8_t * __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxq_f64 (float64x2_t __a, float64x2_t __b)
++vmulxq_lane_f64 (float64x2_t __a, float64x1_t __v, const int __lane)
{
- int8x8x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -30951,14 +32775,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
- ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
- return ret;
-+ return __builtin_aarch64_fmulxv2df (__a, __b);
++ return vmulxq_f64 (__a, __aarch64_vdupq_lane_f64 (__v, __lane));
}
-__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
-vld2_p8 (const poly8_t * __a)
-+__extension__ extern __inline float32_t
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxs_f32 (float32_t __a, float32_t __b)
++vmulx_laneq_f32 (float32x2_t __a, float32x4_t __v, const int __lane)
{
- poly8x8x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -30966,14 +32790,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
- ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
- return ret;
-+ return __builtin_aarch64_fmulxsf (__a, __b);
++ return vmulx_f32 (__a, __aarch64_vdup_laneq_f32 (__v, __lane));
}
-__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
-vld2_s16 (const int16_t * __a)
-+__extension__ extern __inline float64_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxd_f64 (float64_t __a, float64_t __b)
++vmulx_laneq_f64 (float64x1_t __a, float64x2_t __v, const int __lane)
{
- int16x4x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -30981,14 +32805,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
- ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
- return ret;
-+ return __builtin_aarch64_fmulxdf (__a, __b);
++ return vmulx_f64 (__a, __aarch64_vdup_laneq_f64 (__v, __lane));
}
-__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
-vld2_p16 (const poly16_t * __a)
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulx_lane_f32 (float32x2_t __a, float32x2_t __v, const int __lane)
++vmulxq_laneq_f32 (float32x4_t __a, float32x4_t __v, const int __lane)
{
- poly16x4x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -30996,14 +32820,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
- ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
- return ret;
-+ return vmulx_f32 (__a, __aarch64_vdup_lane_f32 (__v, __lane));
++ return vmulxq_f32 (__a, __aarch64_vdupq_laneq_f32 (__v, __lane));
}
-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
-vld2_s32 (const int32_t * __a)
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulx_lane_f64 (float64x1_t __a, float64x1_t __v, const int __lane)
++vmulxq_laneq_f64 (float64x2_t __a, float64x2_t __v, const int __lane)
{
- int32x2x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -31011,14 +32835,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
- ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
- return ret;
-+ return vmulx_f64 (__a, __aarch64_vdup_lane_f64 (__v, __lane));
++ return vmulxq_f64 (__a, __aarch64_vdupq_laneq_f64 (__v, __lane));
}
-__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
-vld2_u8 (const uint8_t * __a)
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxq_lane_f32 (float32x4_t __a, float32x2_t __v, const int __lane)
++vmulxs_lane_f32 (float32_t __a, float32x2_t __v, const int __lane)
{
- uint8x8x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -31026,14 +32850,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
- ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
- return ret;
-+ return vmulxq_f32 (__a, __aarch64_vdupq_lane_f32 (__v, __lane));
++ return vmulxs_f32 (__a, __aarch64_vget_lane_any (__v, __lane));
}
-__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
-vld2_u16 (const uint16_t * __a)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxq_lane_f64 (float64x2_t __a, float64x1_t __v, const int __lane)
++vmulxs_laneq_f32 (float32_t __a, float32x4_t __v, const int __lane)
{
- uint16x4x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -31041,14 +32865,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
- ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
- return ret;
-+ return vmulxq_f64 (__a, __aarch64_vdupq_lane_f64 (__v, __lane));
++ return vmulxs_f32 (__a, __aarch64_vget_lane_any (__v, __lane));
}
-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
-vld2_u32 (const uint32_t * __a)
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulx_laneq_f32 (float32x2_t __a, float32x4_t __v, const int __lane)
++vmulxd_lane_f64 (float64_t __a, float64x1_t __v, const int __lane)
{
- uint32x2x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -31056,14 +32880,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
- ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
- return ret;
-+ return vmulx_f32 (__a, __aarch64_vdup_laneq_f32 (__v, __lane));
++ return vmulxd_f64 (__a, __aarch64_vget_lane_any (__v, __lane));
}
-__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
-vld2_f16 (const float16_t * __a)
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulx_laneq_f64 (float64x1_t __a, float64x2_t __v, const int __lane)
++vmulxd_laneq_f64 (float64_t __a, float64x2_t __v, const int __lane)
{
- float16x4x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -31071,14 +32895,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0);
- ret.val[1] = __builtin_aarch64_get_dregoiv4hf (__o, 1);
- return ret;
-+ return vmulx_f64 (__a, __aarch64_vdup_laneq_f64 (__v, __lane));
++ return vmulxd_f64 (__a, __aarch64_vget_lane_any (__v, __lane));
}
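
As the bodies above show, the _lane/_laneq forms are thin wrappers: they dup the selected lane via the internal __aarch64_vdup*_lane* / __aarch64_vget_lane_any helpers and reuse the base vmulx intrinsic, so no new builtins are required. Typical use (hypothetical inputs):

    #include <arm_neon.h>

    /* Multiply every lane of a by lane 1 of v, with FMULX semantics.  */
    float32x4_t
    scale_by_lane1 (float32x4_t a, float32x4_t v)
    {
      return vmulxq_laneq_f32 (a, v, 1);
    }
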
-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
-vld2_f32 (const float32_t * __a)
-+__extension__ extern __inline float32x4_t
++/* vpmax */
++
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxq_laneq_f32 (float32x4_t __a, float32x4_t __v, const int __lane)
++vpmax_s8 (int8x8_t a, int8x8_t b)
{
- float32x2x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -31086,14 +32912,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 0);
- ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 1);
- return ret;
-+ return vmulxq_f32 (__a, __aarch64_vdupq_laneq_f32 (__v, __lane));
++ return __builtin_aarch64_smaxpv8qi (a, b);
}
-__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
-vld2q_s8 (const int8_t * __a)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxq_laneq_f64 (float64x2_t __a, float64x2_t __v, const int __lane)
++vpmax_s16 (int16x4_t a, int16x4_t b)
{
- int8x16x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -31101,14 +32927,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
- ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
- return ret;
-+ return vmulxq_f64 (__a, __aarch64_vdupq_laneq_f64 (__v, __lane));
++ return __builtin_aarch64_smaxpv4hi (a, b);
}
-__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
-vld2q_p8 (const poly8_t * __a)
-+__extension__ extern __inline float32_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxs_lane_f32 (float32_t __a, float32x2_t __v, const int __lane)
++vpmax_s32 (int32x2_t a, int32x2_t b)
{
- poly8x16x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -31116,14 +32942,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
- ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
- return ret;
-+ return vmulxs_f32 (__a, __aarch64_vget_lane_any (__v, __lane));
++ return __builtin_aarch64_smaxpv2si (a, b);
}
-__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
-vld2q_s16 (const int16_t * __a)
-+__extension__ extern __inline float32_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxs_laneq_f32 (float32_t __a, float32x4_t __v, const int __lane)
++vpmax_u8 (uint8x8_t a, uint8x8_t b)
{
- int16x8x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -31131,14 +32957,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
- ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
- return ret;
-+ return vmulxs_f32 (__a, __aarch64_vget_lane_any (__v, __lane));
++ return (uint8x8_t) __builtin_aarch64_umaxpv8qi ((int8x8_t) a,
++ (int8x8_t) b);
}
-__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
-vld2q_p16 (const poly16_t * __a)
-+__extension__ extern __inline float64_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxd_lane_f64 (float64_t __a, float64x1_t __v, const int __lane)
++vpmax_u16 (uint16x4_t a, uint16x4_t b)
{
- poly16x8x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -31146,14 +32973,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
- ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
- return ret;
-+ return vmulxd_f64 (__a, __aarch64_vget_lane_any (__v, __lane));
++ return (uint16x4_t) __builtin_aarch64_umaxpv4hi ((int16x4_t) a,
++ (int16x4_t) b);
}
-__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
-vld2q_s32 (const int32_t * __a)
-+__extension__ extern __inline float64_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxd_laneq_f64 (float64_t __a, float64x2_t __v, const int __lane)
++vpmax_u32 (uint32x2_t a, uint32x2_t b)
{
- int32x4x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -31161,47 +32989,46 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
- ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
- return ret;
-+ return vmulxd_f64 (__a, __aarch64_vget_lane_any (__v, __lane));
++ return (uint32x2_t) __builtin_aarch64_umaxpv2si ((int32x2_t) a,
++ (int32x2_t) b);
}
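
The by-lane vmulx wrappers above are thin: the vector forms broadcast the selected lane with the __aarch64_vdup*_lane* helpers and call the plain vmulx* intrinsic, while the scalar forms extract the lane with __aarch64_vget_lane_any. A minimal usage sketch of the resulting ACLE entry points (sketch only, not part of the patch; assumes an AArch64 target with this header installed):

    /* FMULX of a vector or scalar by one selected lane.  */
    #include <arm_neon.h>
    #include <stdio.h>

    int main (void)
    {
      float32x2_t a = {1.0f, 2.0f};
      float32x4_t v = {10.0f, 20.0f, 30.0f, 40.0f};
      /* Each lane of a is FMULX-ed with lane 2 of v (30.0f).  */
      float32x2_t r = vmulx_laneq_f32 (a, v, 2);
      /* Scalar form: a[0] times lane 1 of v (20.0f).  */
      float32_t s = vmulxs_laneq_f32 (vget_lane_f32 (a, 0), v, 1);
      printf ("%f %f %f\n", vget_lane_f32 (r, 0), vget_lane_f32 (r, 1), s);
      return 0;
    }
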
-__extension__ static __inline int64x2x2_t __attribute__ ((__always_inline__))
-vld2q_s64 (const int64_t * __a)
--{
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vpmaxq_s8 (int8x16_t a, int8x16_t b)
+ {
- int64x2x2_t ret;
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
- ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
- ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
- return ret;
-+/* vpmax */
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmax_s8 (int8x8_t a, int8x8_t b)
-+{
-+ return __builtin_aarch64_smaxpv8qi (a, b);
++ return __builtin_aarch64_smaxpv16qi (a, b);
}
-__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
-vld2q_u8 (const uint8_t * __a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmax_s16 (int16x4_t a, int16x4_t b)
- {
+-{
- uint8x16x2_t ret;
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
- ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
- ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
- return ret;
-+ return __builtin_aarch64_smaxpv4hi (a, b);
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vpmaxq_s16 (int16x8_t a, int16x8_t b)
++{
++ return __builtin_aarch64_smaxpv8hi (a, b);
}
-__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
-vld2q_u16 (const uint16_t * __a)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmax_s32 (int32x2_t a, int32x2_t b)
++vpmaxq_s32 (int32x4_t a, int32x4_t b)
{
- uint16x8x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -31209,14 +33036,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
- ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
- return ret;
-+ return __builtin_aarch64_smaxpv2si (a, b);
++ return __builtin_aarch64_smaxpv4si (a, b);
}
-__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
-vld2q_u32 (const uint32_t * __a)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmax_u8 (uint8x8_t a, uint8x8_t b)
++vpmaxq_u8 (uint8x16_t a, uint8x16_t b)
{
- uint32x4x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -31224,15 +33051,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
- ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
- return ret;
-+ return (uint8x8_t) __builtin_aarch64_umaxpv8qi ((int8x8_t) a,
-+ (int8x8_t) b);
++ return (uint8x16_t) __builtin_aarch64_umaxpv16qi ((int8x16_t) a,
++ (int8x16_t) b);
}
-__extension__ static __inline uint64x2x2_t __attribute__ ((__always_inline__))
-vld2q_u64 (const uint64_t * __a)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmax_u16 (uint16x4_t a, uint16x4_t b)
++vpmaxq_u16 (uint16x8_t a, uint16x8_t b)
{
- uint64x2x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -31240,15 +33067,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
- ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
- return ret;
-+ return (uint16x4_t) __builtin_aarch64_umaxpv4hi ((int16x4_t) a,
-+ (int16x4_t) b);
++ return (uint16x8_t) __builtin_aarch64_umaxpv8hi ((int16x8_t) a,
++ (int16x8_t) b);
}
-__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
-vld2q_f16 (const float16_t * __a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmax_u32 (uint32x2_t a, uint32x2_t b)
++vpmaxq_u32 (uint32x4_t a, uint32x4_t b)
{
- float16x8x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -31256,15 +33083,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = __builtin_aarch64_get_qregoiv8hf (__o, 0);
- ret.val[1] = __builtin_aarch64_get_qregoiv8hf (__o, 1);
- return ret;
-+ return (uint32x2_t) __builtin_aarch64_umaxpv2si ((int32x2_t) a,
-+ (int32x2_t) b);
++ return (uint32x4_t) __builtin_aarch64_umaxpv4si ((int32x4_t) a,
++ (int32x4_t) b);
}
-__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
-vld2q_f32 (const float32_t * __a)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxq_s8 (int8x16_t a, int8x16_t b)
++vpmax_f32 (float32x2_t a, float32x2_t b)
{
- float32x4x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -31272,14 +33099,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 0);
- ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 1);
- return ret;
-+ return __builtin_aarch64_smaxpv16qi (a, b);
++ return __builtin_aarch64_smax_nanpv2sf (a, b);
}
-__extension__ static __inline float64x2x2_t __attribute__ ((__always_inline__))
-vld2q_f64 (const float64_t * __a)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxq_s16 (int16x8_t a, int16x8_t b)
++vpmaxq_f32 (float32x4_t a, float32x4_t b)
{
- float64x2x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -31287,14 +33114,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 0);
- ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 1);
- return ret;
-+ return __builtin_aarch64_smaxpv8hi (a, b);
++ return __builtin_aarch64_smax_nanpv4sf (a, b);
}
-__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
-vld3_s64 (const int64_t * __a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxq_s32 (int32x4_t a, int32x4_t b)
++vpmaxq_f64 (float64x2_t a, float64x2_t b)
{
- int64x1x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31303,14 +33130,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
- ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
- return ret;
-+ return __builtin_aarch64_smaxpv4si (a, b);
++ return __builtin_aarch64_smax_nanpv2df (a, b);
}
-__extension__ static __inline uint64x1x3_t __attribute__ ((__always_inline__))
-vld3_u64 (const uint64_t * __a)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxq_u8 (uint8x16_t a, uint8x16_t b)
++vpmaxqd_f64 (float64x2_t a)
{
- uint64x1x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31319,15 +33146,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
- ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
- return ret;
-+ return (uint8x16_t) __builtin_aarch64_umaxpv16qi ((int8x16_t) a,
-+ (int8x16_t) b);
++ return __builtin_aarch64_reduc_smax_nan_scal_v2df (a);
}
-__extension__ static __inline float64x1x3_t __attribute__ ((__always_inline__))
-vld3_f64 (const float64_t * __a)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxq_u16 (uint16x8_t a, uint16x8_t b)
++vpmaxs_f32 (float32x2_t a)
{
- float64x1x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31336,15 +33162,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 1)};
- ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 2)};
- return ret;
-+ return (uint16x8_t) __builtin_aarch64_umaxpv8hi ((int16x8_t) a,
-+ (int16x8_t) b);
++ return __builtin_aarch64_reduc_smax_nan_scal_v2sf (a);
}
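
Two details are worth noting in the vpmax block above: the unsigned variants cast through the signed vector types because the umaxp builtins are declared on signed modes, and the single-operand spellings (vpmaxs_f32, vpmaxqd_f64) reduce the two lanes of one register via the reduc_*_scal builtins rather than pairing two inputs. A short sketch, assuming an AArch64 target:

    /* Sketch only.  Pairwise max: result lanes come from adjacent
       pairs of a, then adjacent pairs of b.  */
    #include <arm_neon.h>
    #include <stdio.h>

    int main (void)
    {
      uint8x8_t a = {1, 9, 2, 8, 3, 7, 4, 6};
      uint8x8_t b = {5, 0, 5, 0, 5, 0, 5, 0};
      uint8x8_t p = vpmax_u8 (a, b);   /* {9, 8, 7, 6, 5, 5, 5, 5} */
      float32x2_t f = {1.5f, -2.0f};
      printf ("%u %f\n", vget_lane_u8 (p, 0), vpmaxs_f32 (f)); /* 9 1.5 */
      return 0;
    }
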
-__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
-vld3_s8 (const int8_t * __a)
-+__extension__ extern __inline uint32x4_t
++/* vpmaxnm */
++
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxq_u32 (uint32x4_t a, uint32x4_t b)
++vpmaxnm_f32 (float32x2_t a, float32x2_t b)
{
- int8x8x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31353,15 +33180,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
- ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
- return ret;
-+ return (uint32x4_t) __builtin_aarch64_umaxpv4si ((int32x4_t) a,
-+ (int32x4_t) b);
++ return __builtin_aarch64_smaxpv2sf (a, b);
}
-__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
-vld3_p8 (const poly8_t * __a)
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmax_f32 (float32x2_t a, float32x2_t b)
++vpmaxnmq_f32 (float32x4_t a, float32x4_t b)
{
- poly8x8x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31370,14 +33196,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
- ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
- return ret;
-+ return __builtin_aarch64_smax_nanpv2sf (a, b);
++ return __builtin_aarch64_smaxpv4sf (a, b);
}
-__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
-vld3_s16 (const int16_t * __a)
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxq_f32 (float32x4_t a, float32x4_t b)
++vpmaxnmq_f64 (float64x2_t a, float64x2_t b)
{
- int16x4x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31386,14 +33212,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
- ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
- return ret;
-+ return __builtin_aarch64_smax_nanpv4sf (a, b);
++ return __builtin_aarch64_smaxpv2df (a, b);
}
-__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
-vld3_p16 (const poly16_t * __a)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxq_f64 (float64x2_t a, float64x2_t b)
++vpmaxnmqd_f64 (float64x2_t a)
{
- poly16x4x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31402,14 +33228,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
- ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
- return ret;
-+ return __builtin_aarch64_smax_nanpv2df (a, b);
++ return __builtin_aarch64_reduc_smax_scal_v2df (a);
}
-__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
-vld3_s32 (const int32_t * __a)
-+__extension__ extern __inline float64_t
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxqd_f64 (float64x2_t a)
++vpmaxnms_f32 (float32x2_t a)
{
- int32x2x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31418,14 +33244,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1);
- ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2);
- return ret;
-+ return __builtin_aarch64_reduc_smax_nan_scal_v2df (a);
++ return __builtin_aarch64_reduc_smax_scal_v2sf (a);
}
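
The builtin split above encodes the NaN behaviour: vpmax_f32 lowers to __builtin_aarch64_smax_nanpv2sf, the NaN-propagating pairwise max, while vpmaxnm_f32 lowers to the plain __builtin_aarch64_smaxpv2sf, which should follow IEEE maxNum semantics and prefer the numeric operand over a quiet NaN. A sketch of the visible difference (the instruction mapping, FMAXP vs. FMAXNMP, is an assumption inferred from the builtin names):

    /* Sketch only.  vpmax propagates NaNs; vpmaxnm prefers numbers.  */
    #include <arm_neon.h>
    #include <math.h>
    #include <stdio.h>

    int main (void)
    {
      float32x2_t x = {NAN, 1.0f};
      float32x2_t y = {2.0f, 3.0f};
      printf ("%f %f\n",
              vget_lane_f32 (vpmax_f32 (x, y), 0),    /* nan */
              vget_lane_f32 (vpmaxnm_f32 (x, y), 0)); /* 1.0 */
      return 0;
    }
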
-__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
-vld3_u8 (const uint8_t * __a)
-+__extension__ extern __inline float32_t
++/* vpmin */
++
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxs_f32 (float32x2_t a)
++vpmin_s8 (int8x8_t a, int8x8_t b)
{
- uint8x8x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31434,16 +33262,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
- ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
- return ret;
-+ return __builtin_aarch64_reduc_smax_nan_scal_v2sf (a);
++ return __builtin_aarch64_sminpv8qi (a, b);
}
-__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
-vld3_u16 (const uint16_t * __a)
-+/* vpmaxnm */
-+
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxnm_f32 (float32x2_t a, float32x2_t b)
++vpmin_s16 (int16x4_t a, int16x4_t b)
{
- uint16x4x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31452,14 +33278,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
- ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
- return ret;
-+ return __builtin_aarch64_smaxpv2sf (a, b);
++ return __builtin_aarch64_sminpv4hi (a, b);
}
-__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
-vld3_u32 (const uint32_t * __a)
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxnmq_f32 (float32x4_t a, float32x4_t b)
++vpmin_s32 (int32x2_t a, int32x2_t b)
{
- uint32x2x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31468,14 +33294,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1);
- ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2);
- return ret;
-+ return __builtin_aarch64_smaxpv4sf (a, b);
++ return __builtin_aarch64_sminpv2si (a, b);
}
-__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
-vld3_f16 (const float16_t * __a)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxnmq_f64 (float64x2_t a, float64x2_t b)
++vpmin_u8 (uint8x8_t a, uint8x8_t b)
{
- float16x4x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31484,14 +33310,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = __builtin_aarch64_get_dregciv4hf (__o, 1);
- ret.val[2] = __builtin_aarch64_get_dregciv4hf (__o, 2);
- return ret;
-+ return __builtin_aarch64_smaxpv2df (a, b);
++ return (uint8x8_t) __builtin_aarch64_uminpv8qi ((int8x8_t) a,
++ (int8x8_t) b);
}
-__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
-vld3_f32 (const float32_t * __a)
-+__extension__ extern __inline float64_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxnmqd_f64 (float64x2_t a)
++vpmin_u16 (uint16x4_t a, uint16x4_t b)
{
- float32x2x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31500,14 +33327,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 1);
- ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 2);
- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v2df (a);
++ return (uint16x4_t) __builtin_aarch64_uminpv4hi ((int16x4_t) a,
++ (int16x4_t) b);
}
-__extension__ static __inline int8x16x3_t __attribute__ ((__always_inline__))
-vld3q_s8 (const int8_t * __a)
-+__extension__ extern __inline float32_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxnms_f32 (float32x2_t a)
++vpmin_u32 (uint32x2_t a, uint32x2_t b)
{
- int8x16x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31516,16 +33344,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
- ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v2sf (a);
++ return (uint32x2_t) __builtin_aarch64_uminpv2si ((int32x2_t) a,
++ (int32x2_t) b);
}
-__extension__ static __inline poly8x16x3_t __attribute__ ((__always_inline__))
-vld3q_p8 (const poly8_t * __a)
-+/* vpmin */
-+
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmin_s8 (int8x8_t a, int8x8_t b)
++vpminq_s8 (int8x16_t a, int8x16_t b)
{
- poly8x16x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31534,14 +33361,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
- ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
- return ret;
-+ return __builtin_aarch64_sminpv8qi (a, b);
++ return __builtin_aarch64_sminpv16qi (a, b);
}
-__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__))
-vld3q_s16 (const int16_t * __a)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmin_s16 (int16x4_t a, int16x4_t b)
++vpminq_s16 (int16x8_t a, int16x8_t b)
{
- int16x8x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31550,14 +33377,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
- ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
- return ret;
-+ return __builtin_aarch64_sminpv4hi (a, b);
++ return __builtin_aarch64_sminpv8hi (a, b);
}
-__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__))
-vld3q_p16 (const poly16_t * __a)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmin_s32 (int32x2_t a, int32x2_t b)
++vpminq_s32 (int32x4_t a, int32x4_t b)
{
- poly16x8x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31566,14 +33393,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
- ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
- return ret;
-+ return __builtin_aarch64_sminpv2si (a, b);
++ return __builtin_aarch64_sminpv4si (a, b);
}
-__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__))
-vld3q_s32 (const int32_t * __a)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmin_u8 (uint8x8_t a, uint8x8_t b)
++vpminq_u8 (uint8x16_t a, uint8x16_t b)
{
- int32x4x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31582,16 +33409,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
- ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
- return ret;
-+ return (uint8x8_t) __builtin_aarch64_uminpv8qi ((int8x8_t) a,
-+ (int8x8_t) b);
++ return (uint8x16_t) __builtin_aarch64_uminpv16qi ((int8x16_t) a,
++ (int8x16_t) b);
}
-__extension__ static __inline int64x2x3_t __attribute__ ((__always_inline__))
-vld3q_s64 (const int64_t * __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmin_u16 (uint16x4_t a, uint16x4_t b)
- {
+-{
- int64x2x3_t ret;
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
@@ -31599,15 +33423,19 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
- ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
- return ret;
-+ return (uint16x4_t) __builtin_aarch64_uminpv4hi ((int16x4_t) a,
-+ (int16x4_t) b);
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vpminq_u16 (uint16x8_t a, uint16x8_t b)
++{
++ return (uint16x8_t) __builtin_aarch64_uminpv8hi ((int16x8_t) a,
++ (int16x8_t) b);
}
-__extension__ static __inline uint8x16x3_t __attribute__ ((__always_inline__))
-vld3q_u8 (const uint8_t * __a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmin_u32 (uint32x2_t a, uint32x2_t b)
++vpminq_u32 (uint32x4_t a, uint32x4_t b)
{
- uint8x16x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31616,15 +33444,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
- ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
- return ret;
-+ return (uint32x2_t) __builtin_aarch64_uminpv2si ((int32x2_t) a,
-+ (int32x2_t) b);
++ return (uint32x4_t) __builtin_aarch64_uminpv4si ((int32x4_t) a,
++ (int32x4_t) b);
}
-__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__))
-vld3q_u16 (const uint16_t * __a)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminq_s8 (int8x16_t a, int8x16_t b)
++vpmin_f32 (float32x2_t a, float32x2_t b)
{
- uint16x8x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31633,14 +33461,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
- ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
- return ret;
-+ return __builtin_aarch64_sminpv16qi (a, b);
++ return __builtin_aarch64_smin_nanpv2sf (a, b);
}
-__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__))
-vld3q_u32 (const uint32_t * __a)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminq_s16 (int16x8_t a, int16x8_t b)
++vpminq_f32 (float32x4_t a, float32x4_t b)
{
- uint32x4x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31649,14 +33477,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
- ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
- return ret;
-+ return __builtin_aarch64_sminpv8hi (a, b);
++ return __builtin_aarch64_smin_nanpv4sf (a, b);
}
-__extension__ static __inline uint64x2x3_t __attribute__ ((__always_inline__))
-vld3q_u64 (const uint64_t * __a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminq_s32 (int32x4_t a, int32x4_t b)
++vpminq_f64 (float64x2_t a, float64x2_t b)
{
- uint64x2x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31665,14 +33493,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
- ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
- return ret;
-+ return __builtin_aarch64_sminpv4si (a, b);
++ return __builtin_aarch64_smin_nanpv2df (a, b);
}
-__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
-vld3q_f16 (const float16_t * __a)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminq_u8 (uint8x16_t a, uint8x16_t b)
++vpminqd_f64 (float64x2_t a)
{
- float16x8x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31681,15 +33509,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = __builtin_aarch64_get_qregciv8hf (__o, 1);
- ret.val[2] = __builtin_aarch64_get_qregciv8hf (__o, 2);
- return ret;
-+ return (uint8x16_t) __builtin_aarch64_uminpv16qi ((int8x16_t) a,
-+ (int8x16_t) b);
++ return __builtin_aarch64_reduc_smin_nan_scal_v2df (a);
}
-__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
-vld3q_f32 (const float32_t * __a)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminq_u16 (uint16x8_t a, uint16x8_t b)
++vpmins_f32 (float32x2_t a)
{
- float32x4x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31698,15 +33525,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 1);
- ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 2);
- return ret;
-+ return (uint16x8_t) __builtin_aarch64_uminpv8hi ((int16x8_t) a,
-+ (int16x8_t) b);
++ return __builtin_aarch64_reduc_smin_nan_scal_v2sf (a);
}
-__extension__ static __inline float64x2x3_t __attribute__ ((__always_inline__))
-vld3q_f64 (const float64_t * __a)
-+__extension__ extern __inline uint32x4_t
++/* vpminnm */
++
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminq_u32 (uint32x4_t a, uint32x4_t b)
++vpminnm_f32 (float32x2_t a, float32x2_t b)
{
- float64x2x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -31715,15 +33543,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 1);
- ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 2);
- return ret;
-+ return (uint32x4_t) __builtin_aarch64_uminpv4si ((int32x4_t) a,
-+ (int32x4_t) b);
++ return __builtin_aarch64_sminpv2sf (a, b);
}
-__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
-vld4_s64 (const int64_t * __a)
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmin_f32 (float32x2_t a, float32x2_t b)
++vpminnmq_f32 (float32x4_t a, float32x4_t b)
{
- int64x1x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -31733,14 +33560,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 2);
- ret.val[3] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 3);
- return ret;
-+ return __builtin_aarch64_smin_nanpv2sf (a, b);
++ return __builtin_aarch64_sminpv4sf (a, b);
}
-__extension__ static __inline uint64x1x4_t __attribute__ ((__always_inline__))
-vld4_u64 (const uint64_t * __a)
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminq_f32 (float32x4_t a, float32x4_t b)
++vpminnmq_f64 (float64x2_t a, float64x2_t b)
{
- uint64x1x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -31750,14 +33577,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 2);
- ret.val[3] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 3);
- return ret;
-+ return __builtin_aarch64_smin_nanpv4sf (a, b);
++ return __builtin_aarch64_sminpv2df (a, b);
}
-__extension__ static __inline float64x1x4_t __attribute__ ((__always_inline__))
-vld4_f64 (const float64_t * __a)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminq_f64 (float64x2_t a, float64x2_t b)
++vpminnmqd_f64 (float64x2_t a)
{
- float64x1x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -31767,14 +33594,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 2)};
- ret.val[3] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 3)};
- return ret;
-+ return __builtin_aarch64_smin_nanpv2df (a, b);
++ return __builtin_aarch64_reduc_smin_scal_v2df (a);
}
-__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__))
-vld4_s8 (const int8_t * __a)
-+__extension__ extern __inline float64_t
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminqd_f64 (float64x2_t a)
++vpminnms_f32 (float32x2_t a)
{
- int8x8x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -31784,14 +33611,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
- ret.val[3] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
- return ret;
-+ return __builtin_aarch64_reduc_smin_nan_scal_v2df (a);
++ return __builtin_aarch64_reduc_smin_scal_v2sf (a);
}
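
As with vpmaxnm, the scalar vpminnm spellings above (vpminnmqd_f64, vpminnms_f32) reduce a single register through the reduc_smin_scal builtins. A sketch contrasting one with the NaN-propagating vpminqd_f64 form, assuming an AArch64 target:

    /* Sketch only.  Both reduce the two lanes of one register.  */
    #include <arm_neon.h>
    #include <stdio.h>

    int main (void)
    {
      float64x2_t d = {4.0, -1.0};
      printf ("%f %f\n", vpminqd_f64 (d), vpminnmqd_f64 (d)); /* -1.0 -1.0 */
      return 0;
    }
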
-__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__))
-vld4_p8 (const poly8_t * __a)
-+__extension__ extern __inline float32_t
++/* vmaxnm */
++
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmins_f32 (float32x2_t a)
++vmaxnm_f32 (float32x2_t __a, float32x2_t __b)
{
- poly8x8x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -31801,16 +33630,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
- ret.val[3] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
- return ret;
-+ return __builtin_aarch64_reduc_smin_nan_scal_v2sf (a);
++ return __builtin_aarch64_fmaxv2sf (__a, __b);
}
-__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__))
-vld4_s16 (const int16_t * __a)
-+/* vpminnm */
-+
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminnm_f32 (float32x2_t a, float32x2_t b)
++vmaxnm_f64 (float64x1_t __a, float64x1_t __b)
{
- int16x4x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -31820,14 +33647,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
- ret.val[3] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
- return ret;
-+ return __builtin_aarch64_sminpv2sf (a, b);
++ return (float64x1_t)
++ { __builtin_aarch64_fmaxdf (vget_lane_f64 (__a, 0),
++ vget_lane_f64 (__b, 0)) };
}
-__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__))
-vld4_p16 (const poly16_t * __a)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminnmq_f32 (float32x4_t a, float32x4_t b)
++vmaxnmq_f32 (float32x4_t __a, float32x4_t __b)
{
- poly16x4x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -31837,14 +33666,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
- ret.val[3] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
- return ret;
-+ return __builtin_aarch64_sminpv4sf (a, b);
++ return __builtin_aarch64_fmaxv4sf (__a, __b);
}
-__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__))
-vld4_s32 (const int32_t * __a)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminnmq_f64 (float64x2_t a, float64x2_t b)
++vmaxnmq_f64 (float64x2_t __a, float64x2_t __b)
{
- int32x2x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -31854,14 +33683,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2);
- ret.val[3] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3);
- return ret;
-+ return __builtin_aarch64_sminpv2df (a, b);
++ return __builtin_aarch64_fmaxv2df (__a, __b);
}
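
Because float64x1_t is a real one-element vector type in this header, vmaxnm_f64 above cannot call a vector builtin directly: it unwraps both operands with vget_lane_f64, calls the scalar __builtin_aarch64_fmaxdf, and rewraps the result in a compound initializer. Usage is unchanged for callers, e.g. (sketch only, AArch64 target assumed):

    /* The float64x1_t wrapper behaves like any other vmaxnm.  */
    #include <arm_neon.h>
    #include <stdio.h>

    int main (void)
    {
      float64x1_t a = {2.5};
      float64x1_t b = {7.5};
      printf ("%f\n", vget_lane_f64 (vmaxnm_f64 (a, b), 0)); /* 7.5 */
      return 0;
    }
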
-__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__))
-vld4_u8 (const uint8_t * __a)
-+__extension__ extern __inline float64_t
++/* vmaxv */
++
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminnmqd_f64 (float64x2_t a)
++vmaxv_f32 (float32x2_t __a)
{
- uint8x8x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -31871,14 +33702,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
- ret.val[3] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v2df (a);
++ return __builtin_aarch64_reduc_smax_nan_scal_v2sf (__a);
}
-__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__))
-vld4_u16 (const uint16_t * __a)
-+__extension__ extern __inline float32_t
++__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminnms_f32 (float32x2_t a)
++vmaxv_s8 (int8x8_t __a)
{
- uint16x4x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -31888,16 +33719,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
- ret.val[3] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v2sf (a);
++ return __builtin_aarch64_reduc_smax_scal_v8qi (__a);
}
-__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__))
-vld4_u32 (const uint32_t * __a)
-+/* vmaxnm */
-+
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnm_f32 (float32x2_t __a, float32x2_t __b)
++vmaxv_s16 (int16x4_t __a)
{
- uint32x2x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -31907,14 +33736,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2);
- ret.val[3] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3);
- return ret;
-+ return __builtin_aarch64_fmaxv2sf (__a, __b);
++ return __builtin_aarch64_reduc_smax_scal_v4hi (__a);
}
-__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
-vld4_f16 (const float16_t * __a)
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnm_f64 (float64x1_t __a, float64x1_t __b)
++vmaxv_s32 (int32x2_t __a)
{
- float16x4x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -31924,16 +33753,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = __builtin_aarch64_get_dregxiv4hf (__o, 2);
- ret.val[3] = __builtin_aarch64_get_dregxiv4hf (__o, 3);
- return ret;
-+ return (float64x1_t)
-+ { __builtin_aarch64_fmaxdf (vget_lane_f64 (__a, 0),
-+ vget_lane_f64 (__b, 0)) };
++ return __builtin_aarch64_reduc_smax_scal_v2si (__a);
}
-__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
-vld4_f32 (const float32_t * __a)
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnmq_f32 (float32x4_t __a, float32x4_t __b)
++vmaxv_u8 (uint8x8_t __a)
{
- float32x2x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -31943,14 +33770,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 2);
- ret.val[3] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 3);
- return ret;
-+ return __builtin_aarch64_fmaxv4sf (__a, __b);
++ return __builtin_aarch64_reduc_umax_scal_v8qi_uu (__a);
}
-__extension__ static __inline int8x16x4_t __attribute__ ((__always_inline__))
-vld4q_s8 (const int8_t * __a)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnmq_f64 (float64x2_t __a, float64x2_t __b)
++vmaxv_u16 (uint16x4_t __a)
{
- int8x16x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -31960,16 +33787,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
- ret.val[3] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
- return ret;
-+ return __builtin_aarch64_fmaxv2df (__a, __b);
++ return __builtin_aarch64_reduc_umax_scal_v4hi_uu (__a);
}
-__extension__ static __inline poly8x16x4_t __attribute__ ((__always_inline__))
-vld4q_p8 (const poly8_t * __a)
-+/* vmaxv */
-+
-+__extension__ extern __inline float32_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxv_f32 (float32x2_t __a)
++vmaxv_u32 (uint32x2_t __a)
{
- poly8x16x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -31979,14 +33804,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
- ret.val[3] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
- return ret;
-+ return __builtin_aarch64_reduc_smax_nan_scal_v2sf (__a);
++ return __builtin_aarch64_reduc_umax_scal_v2si_uu (__a);
}
-__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__))
-vld4q_s16 (const int16_t * __a)
-+__extension__ extern __inline int8_t
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxv_s8 (int8x8_t __a)
++vmaxvq_f32 (float32x4_t __a)
{
- int16x8x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -31996,14 +33821,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
- ret.val[3] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v8qi (__a);
++ return __builtin_aarch64_reduc_smax_nan_scal_v4sf (__a);
}
-__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__))
-vld4q_p16 (const poly16_t * __a)
-+__extension__ extern __inline int16_t
++__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxv_s16 (int16x4_t __a)
++vmaxvq_f64 (float64x2_t __a)
{
- poly16x8x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -32013,14 +33838,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
- ret.val[3] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v4hi (__a);
++ return __builtin_aarch64_reduc_smax_nan_scal_v2df (__a);
}
-__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__))
-vld4q_s32 (const int32_t * __a)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxv_s32 (int32x2_t __a)
++vmaxvq_s8 (int8x16_t __a)
{
- int32x4x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -32030,14 +33855,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
- ret.val[3] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v2si (__a);
++ return __builtin_aarch64_reduc_smax_scal_v16qi (__a);
}
-__extension__ static __inline int64x2x4_t __attribute__ ((__always_inline__))
-vld4q_s64 (const int64_t * __a)
-+__extension__ extern __inline uint8_t
++__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxv_u8 (uint8x8_t __a)
++vmaxvq_s16 (int16x8_t __a)
{
- int64x2x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -32047,14 +33872,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
- ret.val[3] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
- return ret;
-+ return __builtin_aarch64_reduc_umax_scal_v8qi_uu (__a);
++ return __builtin_aarch64_reduc_smax_scal_v8hi (__a);
}
-__extension__ static __inline uint8x16x4_t __attribute__ ((__always_inline__))
-vld4q_u8 (const uint8_t * __a)
-+__extension__ extern __inline uint16_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxv_u16 (uint16x4_t __a)
++vmaxvq_s32 (int32x4_t __a)
{
- uint8x16x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -32064,14 +33889,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
- ret.val[3] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
- return ret;
-+ return __builtin_aarch64_reduc_umax_scal_v4hi_uu (__a);
++ return __builtin_aarch64_reduc_smax_scal_v4si (__a);
}
-__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__))
-vld4q_u16 (const uint16_t * __a)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxv_u32 (uint32x2_t __a)
++vmaxvq_u8 (uint8x16_t __a)
{
- uint16x8x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -32081,14 +33906,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
- ret.val[3] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
- return ret;
-+ return __builtin_aarch64_reduc_umax_scal_v2si_uu (__a);
++ return __builtin_aarch64_reduc_umax_scal_v16qi_uu (__a);
}
-__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__))
-vld4q_u32 (const uint32_t * __a)
-+__extension__ extern __inline float32_t
++__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxvq_f32 (float32x4_t __a)
++vmaxvq_u16 (uint16x8_t __a)
{
- uint32x4x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -32098,14 +33923,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
- ret.val[3] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
- return ret;
-+ return __builtin_aarch64_reduc_smax_nan_scal_v4sf (__a);
++ return __builtin_aarch64_reduc_umax_scal_v8hi_uu (__a);
}
-__extension__ static __inline uint64x2x4_t __attribute__ ((__always_inline__))
-vld4q_u64 (const uint64_t * __a)
-+__extension__ extern __inline float64_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxvq_f64 (float64x2_t __a)
++vmaxvq_u32 (uint32x4_t __a)
{
- uint64x2x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -32115,14 +33940,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
- ret.val[3] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
- return ret;
-+ return __builtin_aarch64_reduc_smax_nan_scal_v2df (__a);
++ return __builtin_aarch64_reduc_umax_scal_v4si_uu (__a);
}
-__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
-vld4q_f16 (const float16_t * __a)
-+__extension__ extern __inline int8_t
++/* vmaxnmv */
++
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxvq_s8 (int8x16_t __a)
++vmaxnmv_f32 (float32x2_t __a)
{
- float16x8x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -32132,14 +33959,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = __builtin_aarch64_get_qregxiv8hf (__o, 2);
- ret.val[3] = __builtin_aarch64_get_qregxiv8hf (__o, 3);
- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v16qi (__a);
++ return __builtin_aarch64_reduc_smax_scal_v2sf (__a);
}
-__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
-vld4q_f32 (const float32_t * __a)
-+__extension__ extern __inline int16_t
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxvq_s16 (int16x8_t __a)
++vmaxnmvq_f32 (float32x4_t __a)
{
- float32x4x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -32149,14 +33976,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 2);
- ret.val[3] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 3);
- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v8hi (__a);
++ return __builtin_aarch64_reduc_smax_scal_v4sf (__a);
}
-__extension__ static __inline float64x2x4_t __attribute__ ((__always_inline__))
-vld4q_f64 (const float64_t * __a)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxvq_s32 (int32x4_t __a)
++vmaxnmvq_f64 (float64x2_t __a)
{
- float64x2x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -32166,16 +33993,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 2);
- ret.val[3] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 3);
- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v4si (__a);
++ return __builtin_aarch64_reduc_smax_scal_v2df (__a);
}
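
The vmaxv/vmaxnmv pairs above repeat the pairwise split across all lanes: the float vmaxv forms use the reduc_smax_nan_scal builtins (presumably FMAXV, NaN-propagating) and the vmaxnmv forms the plain reduc_smax_scal ones (presumably FMAXNMV). A sketch, assuming an AArch64 target:

    /* Sketch only.  Across-lane max with and without NaN propagation.  */
    #include <arm_neon.h>
    #include <math.h>
    #include <stdio.h>

    int main (void)
    {
      float32x4_t v = {NAN, 2.0f, 8.0f, 4.0f};
      printf ("%f %f\n", vmaxvq_f32 (v), vmaxnmvq_f32 (v)); /* nan 8.0 */
      return 0;
    }
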
-/* vldn_dup */
--
++/* vmin */
+
-__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
-vld2_dup_s8 (const int8_t * __a)
-+__extension__ extern __inline uint8_t
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxvq_u8 (uint8x16_t __a)
++vmin_f32 (float32x2_t __a, float32x2_t __b)
{
- int8x8x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32183,14 +34011,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
- ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
- return ret;
-+ return __builtin_aarch64_reduc_umax_scal_v16qi_uu (__a);
++ return __builtin_aarch64_smin_nanv2sf (__a, __b);
}
-__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
-vld2_dup_s16 (const int16_t * __a)
-+__extension__ extern __inline uint16_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxvq_u16 (uint16x8_t __a)
++vmin_f64 (float64x1_t __a, float64x1_t __b)
{
- int16x4x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32198,14 +34026,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
- ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
- return ret;
-+ return __builtin_aarch64_reduc_umax_scal_v8hi_uu (__a);
++ return (float64x1_t)
++ { __builtin_aarch64_smin_nandf (vget_lane_f64 (__a, 0),
++ vget_lane_f64 (__b, 0)) };
}
-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
-vld2_dup_s32 (const int32_t * __a)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxvq_u32 (uint32x4_t __a)
++vmin_s8 (int8x8_t __a, int8x8_t __b)
{
- int32x2x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32213,16 +34043,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
- ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
- return ret;
-+ return __builtin_aarch64_reduc_umax_scal_v4si_uu (__a);
++ return __builtin_aarch64_sminv8qi (__a, __b);
}
-__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
-vld2_dup_f16 (const float16_t * __a)
-+/* vmaxnmv */
-+
-+__extension__ extern __inline float32_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnmv_f32 (float32x2_t __a)
++vmin_s16 (int16x4_t __a, int16x4_t __b)
{
- float16x4x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32230,14 +34058,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0);
- ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregoiv4hf (__o, 1);
- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v2sf (__a);
++ return __builtin_aarch64_sminv4hi (__a, __b);
}
-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
-vld2_dup_f32 (const float32_t * __a)
-+__extension__ extern __inline float32_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnmvq_f32 (float32x4_t __a)
++vmin_s32 (int32x2_t __a, int32x2_t __b)
{
- float32x2x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32245,14 +34073,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 0);
- ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 1);
- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v4sf (__a);
++ return __builtin_aarch64_sminv2si (__a, __b);
}
-__extension__ static __inline float64x1x2_t __attribute__ ((__always_inline__))
-vld2_dup_f64 (const float64_t * __a)
-+__extension__ extern __inline float64_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnmvq_f64 (float64x2_t __a)
++vmin_u8 (uint8x8_t __a, uint8x8_t __b)
{
- float64x1x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32260,16 +34088,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 0)};
- ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 1)};
- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v2df (__a);
++ return (uint8x8_t) __builtin_aarch64_uminv8qi ((int8x8_t) __a,
++ (int8x8_t) __b);
}
-__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
-vld2_dup_u8 (const uint8_t * __a)
-+/* vmin */
-+
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmin_f32 (float32x2_t __a, float32x2_t __b)
++vmin_u16 (uint16x4_t __a, uint16x4_t __b)
{
- uint8x8x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32277,14 +34104,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
- ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
- return ret;
-+ return __builtin_aarch64_smin_nanv2sf (__a, __b);
++ return (uint16x4_t) __builtin_aarch64_uminv4hi ((int16x4_t) __a,
++ (int16x4_t) __b);
}
-__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
-vld2_dup_u16 (const uint16_t * __a)
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmin_f64 (float64x1_t __a, float64x1_t __b)
++vmin_u32 (uint32x2_t __a, uint32x2_t __b)
{
- uint16x4x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32292,16 +34120,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
- ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
- return ret;
-+ return (float64x1_t)
-+ { __builtin_aarch64_smin_nandf (vget_lane_f64 (__a, 0),
-+ vget_lane_f64 (__b, 0)) };
++ return (uint32x2_t) __builtin_aarch64_uminv2si ((int32x2_t) __a,
++ (int32x2_t) __b);
}
-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
-vld2_dup_u32 (const uint32_t * __a)
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmin_s8 (int8x8_t __a, int8x8_t __b)
++vminq_f32 (float32x4_t __a, float32x4_t __b)
{
- uint32x2x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32309,14 +34136,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
- ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
- return ret;
-+ return __builtin_aarch64_sminv8qi (__a, __b);
++ return __builtin_aarch64_smin_nanv4sf (__a, __b);
}
-__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
-vld2_dup_p8 (const poly8_t * __a)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmin_s16 (int16x4_t __a, int16x4_t __b)
++vminq_f64 (float64x2_t __a, float64x2_t __b)
{
- poly8x8x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32324,14 +34151,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
- ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
- return ret;
-+ return __builtin_aarch64_sminv4hi (__a, __b);
++ return __builtin_aarch64_smin_nanv2df (__a, __b);
}
-__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
-vld2_dup_p16 (const poly16_t * __a)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmin_s32 (int32x2_t __a, int32x2_t __b)
++vminq_s8 (int8x16_t __a, int8x16_t __b)
{
- poly16x4x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32339,14 +34166,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
- ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
- return ret;
-+ return __builtin_aarch64_sminv2si (__a, __b);
++ return __builtin_aarch64_sminv16qi (__a, __b);
}
-__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
-vld2_dup_s64 (const int64_t * __a)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmin_u8 (uint8x8_t __a, uint8x8_t __b)
++vminq_s16 (int16x8_t __a, int16x8_t __b)
{
- int64x1x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32354,15 +34181,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 0);
- ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 1);
- return ret;
-+ return (uint8x8_t) __builtin_aarch64_uminv8qi ((int8x8_t) __a,
-+ (int8x8_t) __b);
++ return __builtin_aarch64_sminv8hi (__a, __b);
}
-__extension__ static __inline uint64x1x2_t __attribute__ ((__always_inline__))
-vld2_dup_u64 (const uint64_t * __a)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmin_u16 (uint16x4_t __a, uint16x4_t __b)
++vminq_s32 (int32x4_t __a, int32x4_t __b)
{
- uint64x1x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32370,15 +34196,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 0);
- ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 1);
- return ret;
-+ return (uint16x4_t) __builtin_aarch64_uminv4hi ((int16x4_t) __a,
-+ (int16x4_t) __b);
++ return __builtin_aarch64_sminv4si (__a, __b);
}
-__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
-vld2q_dup_s8 (const int8_t * __a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmin_u32 (uint32x2_t __a, uint32x2_t __b)
++vminq_u8 (uint8x16_t __a, uint8x16_t __b)
{
- int8x16x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32386,22 +34211,33 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
- ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
- return ret;
-+ return (uint32x2_t) __builtin_aarch64_uminv2si ((int32x2_t) __a,
-+ (int32x2_t) __b);
++ return (uint8x16_t) __builtin_aarch64_uminv16qi ((int8x16_t) __a,
++ (int8x16_t) __b);
+}
+
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminq_f32 (float32x4_t __a, float32x4_t __b)
++vminq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
-+ return __builtin_aarch64_smin_nanv4sf (__a, __b);
++ return (uint16x8_t) __builtin_aarch64_uminv8hi ((int16x8_t) __a,
++ (int16x8_t) __b);
}
-__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
-vld2q_dup_p8 (const poly8_t * __a)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminq_f64 (float64x2_t __a, float64x2_t __b)
++vminq_u32 (uint32x4_t __a, uint32x4_t __b)
++{
++ return (uint32x4_t) __builtin_aarch64_uminv4si ((int32x4_t) __a,
++ (int32x4_t) __b);
++}
++
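[Usage sketch for the vmin_*/vminq_* forms added above, assuming an AArch64 target and <arm_neon.h>; the wrapper name is invented for illustration. Each of these intrinsics wraps a single __builtin_aarch64_{s,u}min* builtin:

#include <arm_neon.h>

/* Element-wise minimum of two 4x16-bit vectors; expands to one
   SMIN Vd.4H, Vn.4H, Vm.4H via __builtin_aarch64_sminv4hi.  */
static inline int16x4_t
clamp_to_limits (int16x4_t vals, int16x4_t limits)
{
  return vmin_s16 (vals, limits);
}
]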
++/* vminnm */
++
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vminnm_f32 (float32x2_t __a, float32x2_t __b)
{
- poly8x16x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32409,14 +34245,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
- ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
- return ret;
-+ return __builtin_aarch64_smin_nanv2df (__a, __b);
++ return __builtin_aarch64_fminv2sf (__a, __b);
}
-__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
-vld2q_dup_s16 (const int16_t * __a)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminq_s8 (int8x16_t __a, int8x16_t __b)
++vminnm_f64 (float64x1_t __a, float64x1_t __b)
{
- int16x8x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32424,14 +34260,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
- ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
- return ret;
-+ return __builtin_aarch64_sminv16qi (__a, __b);
++ return (float64x1_t)
++ { __builtin_aarch64_fmindf (vget_lane_f64 (__a, 0),
++ vget_lane_f64 (__b, 0)) };
}
-__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
-vld2q_dup_p16 (const poly16_t * __a)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminq_s16 (int16x8_t __a, int16x8_t __b)
++vminnmq_f32 (float32x4_t __a, float32x4_t __b)
{
- poly16x8x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32439,14 +34277,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
- ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
- return ret;
-+ return __builtin_aarch64_sminv8hi (__a, __b);
++ return __builtin_aarch64_fminv4sf (__a, __b);
}
-__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
-vld2q_dup_s32 (const int32_t * __a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminq_s32 (int32x4_t __a, int32x4_t __b)
++vminnmq_f64 (float64x2_t __a, float64x2_t __b)
{
- int32x4x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32454,14 +34292,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
- ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
- return ret;
-+ return __builtin_aarch64_sminv4si (__a, __b);
++ return __builtin_aarch64_fminv2df (__a, __b);
}
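[Note the builtin split above: vmin_f32/vminq_f32 use the smin_nan builtins (the FMIN instruction, which propagates NaN), while the vminnm* forms use the fmin builtins (FMINNM, IEEE 754-2008 minNum). A short sketch of the difference, with illustrative names:

#include <arm_neon.h>

/* For a lane where exactly one input is a quiet NaN:
   vmin_f32   -> NaN        (FMIN propagates NaN)
   vminnm_f32 -> the number (FMINNM implements minNum)  */
static inline float
min_skipping_nan (float a, float b)
{
  float32x2_t v = vminnm_f32 (vdup_n_f32 (a), vdup_n_f32 (b));
  return vget_lane_f32 (v, 0);
}
]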
-__extension__ static __inline int64x2x2_t __attribute__ ((__always_inline__))
-vld2q_dup_s64 (const int64_t * __a)
-+__extension__ extern __inline uint8x16_t
++/* vminv */
++
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminq_u8 (uint8x16_t __a, uint8x16_t __b)
++vminv_f32 (float32x2_t __a)
{
- int64x2x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32469,15 +34309,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
- ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
- return ret;
-+ return (uint8x16_t) __builtin_aarch64_uminv16qi ((int8x16_t) __a,
-+ (int8x16_t) __b);
++ return __builtin_aarch64_reduc_smin_nan_scal_v2sf (__a);
}
-__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
-vld2q_dup_u8 (const uint8_t * __a)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminq_u16 (uint16x8_t __a, uint16x8_t __b)
++vminv_s8 (int8x8_t __a)
{
- uint8x16x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32485,15 +34324,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
- ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
- return ret;
-+ return (uint16x8_t) __builtin_aarch64_uminv8hi ((int16x8_t) __a,
-+ (int16x8_t) __b);
++ return __builtin_aarch64_reduc_smin_scal_v8qi (__a);
}
-__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
-vld2q_dup_u16 (const uint16_t * __a)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminq_u32 (uint32x4_t __a, uint32x4_t __b)
++vminv_s16 (int16x4_t __a)
{
- uint16x8x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32501,17 +34339,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
- ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
- return ret;
-+ return (uint32x4_t) __builtin_aarch64_uminv4si ((int32x4_t) __a,
-+ (int32x4_t) __b);
++ return __builtin_aarch64_reduc_smin_scal_v4hi (__a);
}
-__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
-vld2q_dup_u32 (const uint32_t * __a)
-+/* vminnm */
-+
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnm_f32 (float32x2_t __a, float32x2_t __b)
++vminv_s32 (int32x2_t __a)
{
- uint32x4x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32519,14 +34354,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
- ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
- return ret;
-+ return __builtin_aarch64_fminv2sf (__a, __b);
++ return __builtin_aarch64_reduc_smin_scal_v2si (__a);
}
-__extension__ static __inline uint64x2x2_t __attribute__ ((__always_inline__))
-vld2q_dup_u64 (const uint64_t * __a)
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnm_f64 (float64x1_t __a, float64x1_t __b)
++vminv_u8 (uint8x8_t __a)
{
- uint64x2x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32534,16 +34369,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
- ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
- return ret;
-+ return (float64x1_t)
-+ { __builtin_aarch64_fmindf (vget_lane_f64 (__a, 0),
-+ vget_lane_f64 (__b, 0)) };
++ return __builtin_aarch64_reduc_umin_scal_v8qi_uu (__a);
}
-__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
-vld2q_dup_f16 (const float16_t * __a)
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnmq_f32 (float32x4_t __a, float32x4_t __b)
++vminv_u16 (uint16x4_t __a)
{
- float16x8x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32551,14 +34384,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregoiv8hf (__o, 0);
- ret.val[1] = __builtin_aarch64_get_qregoiv8hf (__o, 1);
- return ret;
-+ return __builtin_aarch64_fminv4sf (__a, __b);
++ return __builtin_aarch64_reduc_umin_scal_v4hi_uu (__a);
}
-__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
-vld2q_dup_f32 (const float32_t * __a)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnmq_f64 (float64x2_t __a, float64x2_t __b)
++vminv_u32 (uint32x2_t __a)
{
- float32x4x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32566,16 +34399,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 0);
- ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 1);
- return ret;
-+ return __builtin_aarch64_fminv2df (__a, __b);
++ return __builtin_aarch64_reduc_umin_scal_v2si_uu (__a);
}
-__extension__ static __inline float64x2x2_t __attribute__ ((__always_inline__))
-vld2q_dup_f64 (const float64_t * __a)
-+/* vminv */
-+
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminv_f32 (float32x2_t __a)
++vminvq_f32 (float32x4_t __a)
{
- float64x2x2_t ret;
- __builtin_aarch64_simd_oi __o;
@@ -32583,14 +34414,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 0);
- ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 1);
- return ret;
-+ return __builtin_aarch64_reduc_smin_nan_scal_v2sf (__a);
++ return __builtin_aarch64_reduc_smin_nan_scal_v4sf (__a);
}
-__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
-vld3_dup_s64 (const int64_t * __a)
-+__extension__ extern __inline int8_t
++__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminv_s8 (int8x8_t __a)
++vminvq_f64 (float64x2_t __a)
{
- int64x1x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32599,14 +34430,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
- ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v8qi (__a);
++ return __builtin_aarch64_reduc_smin_nan_scal_v2df (__a);
}
-__extension__ static __inline uint64x1x3_t __attribute__ ((__always_inline__))
-vld3_dup_u64 (const uint64_t * __a)
-+__extension__ extern __inline int16_t
++__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminv_s16 (int16x4_t __a)
++vminvq_s8 (int8x16_t __a)
{
- uint64x1x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32615,14 +34446,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
- ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v4hi (__a);
++ return __builtin_aarch64_reduc_smin_scal_v16qi (__a);
}
-__extension__ static __inline float64x1x3_t __attribute__ ((__always_inline__))
-vld3_dup_f64 (const float64_t * __a)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminv_s32 (int32x2_t __a)
++vminvq_s16 (int16x8_t __a)
{
- float64x1x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32631,14 +34462,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 1)};
- ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 2)};
- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v2si (__a);
++ return __builtin_aarch64_reduc_smin_scal_v8hi (__a);
}
-__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
-vld3_dup_s8 (const int8_t * __a)
-+__extension__ extern __inline uint8_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminv_u8 (uint8x8_t __a)
++vminvq_s32 (int32x4_t __a)
{
- int8x8x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32647,14 +34478,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
- ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
- return ret;
-+ return __builtin_aarch64_reduc_umin_scal_v8qi_uu (__a);
++ return __builtin_aarch64_reduc_smin_scal_v4si (__a);
}
-__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
-vld3_dup_p8 (const poly8_t * __a)
-+__extension__ extern __inline uint16_t
++__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminv_u16 (uint16x4_t __a)
++vminvq_u8 (uint8x16_t __a)
{
- poly8x8x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32663,14 +34494,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
- ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
- return ret;
-+ return __builtin_aarch64_reduc_umin_scal_v4hi_uu (__a);
++ return __builtin_aarch64_reduc_umin_scal_v16qi_uu (__a);
}
-__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
-vld3_dup_s16 (const int16_t * __a)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminv_u32 (uint32x2_t __a)
++vminvq_u16 (uint16x8_t __a)
{
- int16x4x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32679,14 +34510,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
- ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
- return ret;
-+ return __builtin_aarch64_reduc_umin_scal_v2si_uu (__a);
++ return __builtin_aarch64_reduc_umin_scal_v8hi_uu (__a);
}
-__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
-vld3_dup_p16 (const poly16_t * __a)
-+__extension__ extern __inline float32_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminvq_f32 (float32x4_t __a)
++vminvq_u32 (uint32x4_t __a)
{
- poly16x4x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32695,14 +34526,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
- ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
- return ret;
-+ return __builtin_aarch64_reduc_smin_nan_scal_v4sf (__a);
++ return __builtin_aarch64_reduc_umin_scal_v4si_uu (__a);
}
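[The vminv*/vminvq* forms above are across-lane reductions — one scalar result from a whole vector — via the reduc_{s,u}min_scal builtins. Sketch, with an invented wrapper name:

#include <arm_neon.h>

/* Reduce all 16 byte lanes to their minimum in one UMINV instruction,
   instead of a scalar loop over the lanes.  */
static inline uint8_t
min_byte (uint8x16_t v)
{
  return vminvq_u8 (v);
}
]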
-__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
-vld3_dup_s32 (const int32_t * __a)
-+__extension__ extern __inline float64_t
++/* vminnmv */
++
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminvq_f64 (float64x2_t __a)
++vminnmv_f32 (float32x2_t __a)
{
- int32x2x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32711,14 +34544,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1);
- ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2);
- return ret;
-+ return __builtin_aarch64_reduc_smin_nan_scal_v2df (__a);
++ return __builtin_aarch64_reduc_smin_scal_v2sf (__a);
}
-__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
-vld3_dup_u8 (const uint8_t * __a)
-+__extension__ extern __inline int8_t
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminvq_s8 (int8x16_t __a)
++vminnmvq_f32 (float32x4_t __a)
{
- uint8x8x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32727,14 +34560,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
- ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v16qi (__a);
++ return __builtin_aarch64_reduc_smin_scal_v4sf (__a);
}
-__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
-vld3_dup_u16 (const uint16_t * __a)
-+__extension__ extern __inline int16_t
++__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminvq_s16 (int16x8_t __a)
++vminnmvq_f64 (float64x2_t __a)
{
- uint16x4x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32743,14 +34576,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
- ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v8hi (__a);
++ return __builtin_aarch64_reduc_smin_scal_v2df (__a);
}
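[vminnmv* is the reduction counterpart of vminnm: an across-lane minNum, defined only for floating-point vectors. Illustrative use:

#include <arm_neon.h>

/* Across-lane minNum: a quiet-NaN lane loses to a numeric lane
   (FMINNMV on the 4-lane input).  */
static inline float
min_lane_skipping_nan (float32x4_t v)
{
  return vminnmvq_f32 (v);
}
]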
-__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
-vld3_dup_u32 (const uint32_t * __a)
-+__extension__ extern __inline int32_t
++/* vmla */
++
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminvq_s32 (int32x4_t __a)
++vmla_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
{
- uint32x2x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32759,14 +34594,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1);
- ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2);
- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v4si (__a);
++ return a + b * c;
}
-__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
-vld3_dup_f16 (const float16_t * __a)
-+__extension__ extern __inline uint8_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminvq_u8 (uint8x16_t __a)
++vmla_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
{
- float16x4x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32775,14 +34610,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 1);
- ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 2);
- return ret;
-+ return __builtin_aarch64_reduc_umin_scal_v16qi_uu (__a);
++ return __a + __b * __c;
}
-__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
-vld3_dup_f32 (const float32_t * __a)
-+__extension__ extern __inline uint16_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminvq_u16 (uint16x8_t __a)
++vmlaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
{
- float32x2x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32791,14 +34626,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 1);
- ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 2);
- return ret;
-+ return __builtin_aarch64_reduc_umin_scal_v8hi_uu (__a);
++ return a + b * c;
}
-__extension__ static __inline int8x16x3_t __attribute__ ((__always_inline__))
-vld3q_dup_s8 (const int8_t * __a)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminvq_u32 (uint32x4_t __a)
++vmlaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
{
- int8x16x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32807,16 +34642,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
- ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
- return ret;
-+ return __builtin_aarch64_reduc_umin_scal_v4si_uu (__a);
++ return a + b * c;
}
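[Unlike the reductions, the vmla* bodies above are plain `a + b * c` with no builtin call: GCC's combiner can then form integer MLA itself, and for the float variants contraction to FMLA is governed by -ffp-contract. Usage is unchanged; a sketch with invented names:

#include <arm_neon.h>

/* acc + x * y per lane; may become FMLA or FMUL+FADD depending on
   -ffp-contract, since the intrinsic is now plain arithmetic.  */
static inline float32x4_t
accumulate (float32x4_t acc, float32x4_t x, float32x4_t y)
{
  return vmlaq_f32 (acc, x, y);
}
]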
-__extension__ static __inline poly8x16x3_t __attribute__ ((__always_inline__))
-vld3q_dup_p8 (const poly8_t * __a)
-+/* vminnmv */
++/* vmla_lane */
+
-+__extension__ extern __inline float32_t
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnmv_f32 (float32x2_t __a)
++vmla_lane_f32 (float32x2_t __a, float32x2_t __b,
++ float32x2_t __c, const int __lane)
{
- poly8x16x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32825,14 +34661,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
- ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v2sf (__a);
++ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
}
-__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__))
-vld3q_dup_s16 (const int16_t * __a)
-+__extension__ extern __inline float32_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnmvq_f32 (float32x4_t __a)
++vmla_lane_s16 (int16x4_t __a, int16x4_t __b,
++ int16x4_t __c, const int __lane)
{
- int16x8x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32841,14 +34678,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
- ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v4sf (__a);
++ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
}
-__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__))
-vld3q_dup_p16 (const poly16_t * __a)
-+__extension__ extern __inline float64_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnmvq_f64 (float64x2_t __a)
++vmla_lane_s32 (int32x2_t __a, int32x2_t __b,
++ int32x2_t __c, const int __lane)
{
- poly16x8x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32857,16 +34695,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
- ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v2df (__a);
++ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
}
-__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__))
-vld3q_dup_s32 (const int32_t * __a)
-+/* vmla */
-+
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
++vmla_lane_u16 (uint16x4_t __a, uint16x4_t __b,
++ uint16x4_t __c, const int __lane)
{
- int32x4x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32875,14 +34712,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
- ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
- return ret;
-+ return a + b * c;
++ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
}
-__extension__ static __inline int64x2x3_t __attribute__ ((__always_inline__))
-vld3q_dup_s64 (const int64_t * __a)
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
++vmla_lane_u32 (uint32x2_t __a, uint32x2_t __b,
++ uint32x2_t __c, const int __lane)
{
- int64x2x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32891,14 +34729,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
- ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
- return ret;
-+ return __a + __b * __c;
++ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
}
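[The _lane forms above broadcast one compile-time-constant lane of the last vector argument before the multiply-accumulate, via __aarch64_vget_lane_any. For example (hypothetical wrapper):

#include <arm_neon.h>

/* acc[i] += v[i] * coeffs[2] for each lane; the lane index must be a
   constant within range for the coefficient vector.  */
static inline int16x4_t
scale_and_add (int16x4_t acc, int16x4_t v, int16x4_t coeffs)
{
  return vmla_lane_s16 (acc, v, coeffs, 2);
}
]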
-__extension__ static __inline uint8x16x3_t __attribute__ ((__always_inline__))
-vld3q_dup_u8 (const uint8_t * __a)
-+__extension__ extern __inline float32x4_t
++/* vmla_laneq */
++
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
++vmla_laneq_f32 (float32x2_t __a, float32x2_t __b,
++ float32x4_t __c, const int __lane)
{
- uint8x16x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32907,14 +34748,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
- ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
- return ret;
-+ return a + b * c;
++ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
}
-__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__))
-vld3q_dup_u16 (const uint16_t * __a)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
++vmla_laneq_s16 (int16x4_t __a, int16x4_t __b,
++ int16x8_t __c, const int __lane)
{
- uint16x8x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32923,17 +34765,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
- ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
- return ret;
-+ return a + b * c;
++ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
}
-__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__))
-vld3q_dup_u32 (const uint32_t * __a)
-+/* vmla_lane */
-+
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_lane_f32 (float32x2_t __a, float32x2_t __b,
-+ float32x2_t __c, const int __lane)
++vmla_laneq_s32 (int32x2_t __a, int32x2_t __b,
++ int32x4_t __c, const int __lane)
{
- uint32x4x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32947,10 +34787,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline uint64x2x3_t __attribute__ ((__always_inline__))
-vld3q_dup_u64 (const uint64_t * __a)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_lane_s16 (int16x4_t __a, int16x4_t __b,
-+ int16x4_t __c, const int __lane)
++vmla_laneq_u16 (uint16x4_t __a, uint16x4_t __b,
++ uint16x8_t __c, const int __lane)
{
- uint64x2x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32964,10 +34804,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
-vld3q_dup_f16 (const float16_t * __a)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_lane_s32 (int32x2_t __a, int32x2_t __b,
-+ int32x2_t __c, const int __lane)
++vmla_laneq_u32 (uint32x2_t __a, uint32x2_t __b,
++ uint32x4_t __c, const int __lane)
{
- float16x8x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32981,10 +34821,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
-vld3q_dup_f32 (const float32_t * __a)
-+__extension__ extern __inline uint16x4_t
++/* vmlaq_lane */
++
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_lane_u16 (uint16x4_t __a, uint16x4_t __b,
-+ uint16x4_t __c, const int __lane)
++vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b,
++ float32x2_t __c, const int __lane)
{
- float32x4x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -32998,10 +34840,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float64x2x3_t __attribute__ ((__always_inline__))
-vld3q_dup_f64 (const float64_t * __a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_lane_u32 (uint32x2_t __a, uint32x2_t __b,
-+ uint32x2_t __c, const int __lane)
++vmlaq_lane_s16 (int16x8_t __a, int16x8_t __b,
++ int16x4_t __c, const int __lane)
{
- float64x2x3_t ret;
- __builtin_aarch64_simd_ci __o;
@@ -33015,12 +34857,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
-vld4_dup_s64 (const int64_t * __a)
-+/* vmla_laneq */
-+
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_laneq_f32 (float32x2_t __a, float32x2_t __b,
-+ float32x4_t __c, const int __lane)
++vmlaq_lane_s32 (int32x4_t __a, int32x4_t __b,
++ int32x2_t __c, const int __lane)
{
- int64x1x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33035,10 +34875,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline uint64x1x4_t __attribute__ ((__always_inline__))
-vld4_dup_u64 (const uint64_t * __a)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_laneq_s16 (int16x4_t __a, int16x4_t __b,
-+ int16x8_t __c, const int __lane)
++vmlaq_lane_u16 (uint16x8_t __a, uint16x8_t __b,
++ uint16x4_t __c, const int __lane)
{
- uint64x1x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33053,10 +34893,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float64x1x4_t __attribute__ ((__always_inline__))
-vld4_dup_f64 (const float64_t * __a)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_laneq_s32 (int32x2_t __a, int32x2_t __b,
-+ int32x4_t __c, const int __lane)
++vmlaq_lane_u32 (uint32x4_t __a, uint32x4_t __b,
++ uint32x2_t __c, const int __lane)
{
- float64x1x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33071,10 +34911,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__))
-vld4_dup_s8 (const int8_t * __a)
-+__extension__ extern __inline uint16x4_t
++/* vmlaq_laneq */
++
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_laneq_u16 (uint16x4_t __a, uint16x4_t __b,
-+ uint16x8_t __c, const int __lane)
++vmlaq_laneq_f32 (float32x4_t __a, float32x4_t __b,
++ float32x4_t __c, const int __lane)
{
- int8x8x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33089,10 +34931,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__))
-vld4_dup_p8 (const poly8_t * __a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_laneq_u32 (uint32x2_t __a, uint32x2_t __b,
-+ uint32x4_t __c, const int __lane)
++vmlaq_laneq_s16 (int16x8_t __a, int16x8_t __b,
++ int16x8_t __c, const int __lane)
{
- poly8x8x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33107,12 +34949,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__))
-vld4_dup_s16 (const int16_t * __a)
-+/* vmlaq_lane */
-+
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b,
-+ float32x2_t __c, const int __lane)
++vmlaq_laneq_s32 (int32x4_t __a, int32x4_t __b,
++ int32x4_t __c, const int __lane)
{
- int16x4x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33127,10 +34967,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__))
-vld4_dup_p16 (const poly16_t * __a)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_lane_s16 (int16x8_t __a, int16x8_t __b,
-+ int16x4_t __c, const int __lane)
++vmlaq_laneq_u16 (uint16x8_t __a, uint16x8_t __b,
++ uint16x8_t __c, const int __lane)
{
- poly16x4x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33145,10 +34985,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__))
-vld4_dup_s32 (const int32_t * __a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_lane_s32 (int32x4_t __a, int32x4_t __b,
-+ int32x2_t __c, const int __lane)
++vmlaq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
++ uint32x4_t __c, const int __lane)
{
- int32x2x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33163,10 +35003,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__))
-vld4_dup_u8 (const uint8_t * __a)
-+__extension__ extern __inline uint16x8_t
++/* vmls */
++
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_lane_u16 (uint16x8_t __a, uint16x8_t __b,
-+ uint16x4_t __c, const int __lane)
++vmls_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
{
- uint8x8x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33176,12 +35017,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
- ret.val[3] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
++ return a - b * c;
}
-__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__))
-vld4_dup_u16 (const uint16_t * __a)
--{
++__extension__ extern __inline float64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmls_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
+ {
- uint16x4x4_t ret;
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a);
@@ -33190,22 +35034,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
- ret.val[3] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
- return ret;
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_lane_u32 (uint32x4_t __a, uint32x4_t __b,
-+ uint32x2_t __c, const int __lane)
-+{
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
++ return __a - __b * __c;
}
-__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__))
-vld4_dup_u32 (const uint32_t * __a)
-+ /* vmlaq_laneq */
-+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_laneq_f32 (float32x4_t __a, float32x4_t __b,
-+ float32x4_t __c, const int __lane)
++vmlsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
{
- uint32x2x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33215,15 +35051,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2);
- ret.val[3] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3);
- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
++ return a - b * c;
}
-__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
-vld4_dup_f16 (const float16_t * __a)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_laneq_s16 (int16x8_t __a, int16x8_t __b,
-+ int16x8_t __c, const int __lane)
++vmlsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
{
- float16x4x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33233,15 +35068,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 2);
- ret.val[3] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 3);
- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
++ return a - b * c;
}
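[vmls* mirrors vmla* exactly, with `a - b * c`: multiply, then subtract from the accumulator. One-line sketch (illustrative name):

#include <arm_neon.h>

/* acc - x * y per lane (multiply-subtract).  */
static inline float32x4_t
subtract_product (float32x4_t acc, float32x4_t x, float32x4_t y)
{
  return vmlsq_f32 (acc, x, y);
}
]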
-__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
-vld4_dup_f32 (const float32_t * __a)
-+__extension__ extern __inline int32x4_t
++/* vmls_lane */
++
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_laneq_s32 (int32x4_t __a, int32x4_t __b,
-+ int32x4_t __c, const int __lane)
++vmls_lane_f32 (float32x2_t __a, float32x2_t __b,
++ float32x2_t __c, const int __lane)
{
- float32x2x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33251,15 +35088,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 2);
- ret.val[3] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 3);
- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
++ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
}
-__extension__ static __inline int8x16x4_t __attribute__ ((__always_inline__))
-vld4q_dup_s8 (const int8_t * __a)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_laneq_u16 (uint16x8_t __a, uint16x8_t __b,
-+ uint16x8_t __c, const int __lane)
++vmls_lane_s16 (int16x4_t __a, int16x4_t __b,
++ int16x4_t __c, const int __lane)
{
- int8x16x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33269,15 +35106,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
- ret.val[3] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
++ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
}
-__extension__ static __inline poly8x16x4_t __attribute__ ((__always_inline__))
-vld4q_dup_p8 (const poly8_t * __a)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
-+ uint32x4_t __c, const int __lane)
++vmls_lane_s32 (int32x2_t __a, int32x2_t __b,
++ int32x2_t __c, const int __lane)
{
- poly8x16x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33287,16 +35124,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
- ret.val[3] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
++ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
}
-__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__))
-vld4q_dup_s16 (const int16_t * __a)
-+/* vmls */
-+
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
++vmls_lane_u16 (uint16x4_t __a, uint16x4_t __b,
++ uint16x4_t __c, const int __lane)
{
- int16x8x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33306,14 +35142,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
- ret.val[3] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
- return ret;
-+ return a - b * c;
++ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
}
-__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__))
-vld4q_dup_p16 (const poly16_t * __a)
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
++vmls_lane_u32 (uint32x2_t __a, uint32x2_t __b,
++ uint32x2_t __c, const int __lane)
{
- poly16x8x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33323,14 +35160,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
- ret.val[3] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
- return ret;
-+ return __a - __b * __c;
++ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
}
-__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__))
-vld4q_dup_s32 (const int32_t * __a)
-+__extension__ extern __inline float32x4_t
++/* vmls_laneq */
++
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
++vmls_laneq_f32 (float32x2_t __a, float32x2_t __b,
++ float32x4_t __c, const int __lane)
{
- int32x4x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33340,14 +35180,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
- ret.val[3] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
- return ret;
-+ return a - b * c;
++ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
}
-__extension__ static __inline int64x2x4_t __attribute__ ((__always_inline__))
-vld4q_dup_s64 (const int64_t * __a)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
++vmls_laneq_s16 (int16x4_t __a, int16x4_t __b,
++ int16x8_t __c, const int __lane)
{
- int64x2x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33357,17 +35198,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
- ret.val[3] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
- return ret;
-+ return a - b * c;
++ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
}
-__extension__ static __inline uint8x16x4_t __attribute__ ((__always_inline__))
-vld4q_dup_u8 (const uint8_t * __a)
-+/* vmls_lane */
-+
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_lane_f32 (float32x2_t __a, float32x2_t __b,
-+ float32x2_t __c, const int __lane)
++vmls_laneq_s32 (int32x2_t __a, int32x2_t __b,
++ int32x4_t __c, const int __lane)
{
- uint8x16x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33382,10 +35221,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__))
-vld4q_dup_u16 (const uint16_t * __a)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_lane_s16 (int16x4_t __a, int16x4_t __b,
-+ int16x4_t __c, const int __lane)
++vmls_laneq_u16 (uint16x4_t __a, uint16x4_t __b,
++ uint16x8_t __c, const int __lane)
{
- uint16x8x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33400,10 +35239,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__))
-vld4q_dup_u32 (const uint32_t * __a)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_lane_s32 (int32x2_t __a, int32x2_t __b,
-+ int32x2_t __c, const int __lane)
++vmls_laneq_u32 (uint32x2_t __a, uint32x2_t __b,
++ uint32x4_t __c, const int __lane)
{
- uint32x4x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33418,10 +35257,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline uint64x2x4_t __attribute__ ((__always_inline__))
-vld4q_dup_u64 (const uint64_t * __a)
-+__extension__ extern __inline uint16x4_t
++/* vmlsq_lane */
++
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_lane_u16 (uint16x4_t __a, uint16x4_t __b,
-+ uint16x4_t __c, const int __lane)
++vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b,
++ float32x2_t __c, const int __lane)
{
- uint64x2x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33436,10 +35277,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
-vld4q_dup_f16 (const float16_t * __a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_lane_u32 (uint32x2_t __a, uint32x2_t __b,
-+ uint32x2_t __c, const int __lane)
++vmlsq_lane_s16 (int16x8_t __a, int16x8_t __b,
++ int16x4_t __c, const int __lane)
{
- float16x8x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33454,12 +35295,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
-vld4q_dup_f32 (const float32_t * __a)
-+/* vmls_laneq */
-+
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_laneq_f32 (float32x2_t __a, float32x2_t __b,
-+ float32x4_t __c, const int __lane)
++vmlsq_lane_s32 (int32x4_t __a, int32x4_t __b,
++ int32x2_t __c, const int __lane)
{
- float32x4x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33474,10 +35313,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float64x2x4_t __attribute__ ((__always_inline__))
-vld4q_dup_f64 (const float64_t * __a)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_laneq_s16 (int16x4_t __a, int16x4_t __b,
-+ int16x8_t __c, const int __lane)
++vmlsq_lane_u16 (uint16x8_t __a, uint16x8_t __b,
++ uint16x4_t __c, const int __lane)
{
- float64x2x4_t ret;
- __builtin_aarch64_simd_xi __o;
@@ -33514,10 +35353,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0); \
- __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1); \
- return __b; \
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_laneq_s32 (int32x2_t __a, int32x2_t __b,
-+ int32x4_t __c, const int __lane)
++vmlsq_lane_u32 (uint32x4_t __a, uint32x4_t __b,
++ uint32x2_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
}
@@ -33548,25 +35387,40 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- u32, int32x4_t)
-__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, di, v2di, di,
- u64, int64x2_t)
-+__extension__ extern __inline uint16x4_t
++/* vmlsq_laneq */
++
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_laneq_u16 (uint16x4_t __a, uint16x4_t __b,
-+ uint16x8_t __c, const int __lane)
++vmlsq_laneq_f32 (float32x4_t __a, float32x4_t __b,
++ float32x4_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+}
-#undef __LD2_LANE_FUNC
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_laneq_u32 (uint32x2_t __a, uint32x2_t __b,
-+ uint32x4_t __c, const int __lane)
++vmlsq_laneq_s16 (int16x8_t __a, int16x8_t __b,
++ int16x8_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+}
-/* vld2q_lane */
-+/* vmlsq_lane */
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmlsq_laneq_s32 (int32x4_t __a, int32x4_t __b,
++ int32x4_t __c, const int __lane)
++{
++ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
++}
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmlsq_laneq_u16 (uint16x8_t __a, uint16x8_t __b,
++ uint16x8_t __c, const int __lane)
++{
++ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
++}
-#define __LD2_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
-__extension__ static __inline intype __attribute__ ((__always_inline__)) \
@@ -33581,10 +35435,10 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[0] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 0); \
- ret.val[1] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 1); \
- return ret; \
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b,
-+ float32x2_t __c, const int __lane)
++vmlsq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
++ uint32x4_t __c, const int __lane)
+{
+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
}
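[In the _laneq forms just above, the `q` after `lane` refers to the coefficient vector: the lane is selected from a 128-bit vector while the other operands keep their own width. Sketch with invented names:

#include <arm_neon.h>

/* acc - v * coeffs[5]; coeffs is a full 128-bit (8 x s16) vector, so
   lane indices 0..7 are valid here.  */
static inline int16x8_t
subtract_scaled (int16x8_t acc, int16x8_t v, int16x8_t coeffs)
{
  return vmlsq_laneq_s16 (acc, v, coeffs, 5);
}
]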
@@ -33602,30 +35456,22 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__LD2_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16)
-__LD2_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32)
-__LD2_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_lane_s16 (int16x8_t __a, int16x8_t __b,
-+ int16x4_t __c, const int __lane)
-+{
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+}
++/* vmov_n_ */
-#undef __LD2_LANE_FUNC
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_lane_s32 (int32x4_t __a, int32x4_t __b,
-+ int32x2_t __c, const int __lane)
++vmov_n_f16 (float16_t __a)
+{
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
++ return vdup_n_f16 (__a);
+}
-/* vld3_lane */
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_lane_u16 (uint16x8_t __a, uint16x8_t __b,
-+ uint16x4_t __c, const int __lane)
++vmov_n_f32 (float32_t __a)
+{
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
++ return vdup_n_f32 (__a);
+}
-#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \
@@ -33656,12 +35502,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1); \
- __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2); \
- return __b; \
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_lane_u32 (uint32x4_t __a, uint32x4_t __b,
-+ uint32x2_t __c, const int __lane)
++vmov_n_f64 (float64_t __a)
+{
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
++ return (float64x1_t) {__a};
}
-__LD3_LANE_FUNC (float16x4x3_t, float16x4_t, float16x8x3_t, float16_t, v4hf,
@@ -33690,24 +35535,27 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- u32, int32x4_t)
-__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, di, v2di, di,
- u64, int64x2_t)
-+ /* vmlsq_laneq */
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmov_n_p8 (poly8_t __a)
++{
++ return vdup_n_p8 (__a);
++}
-#undef __LD3_LANE_FUNC
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_laneq_f32 (float32x4_t __a, float32x4_t __b,
-+ float32x4_t __c, const int __lane)
++vmov_n_p16 (poly16_t __a)
+{
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
++ return vdup_n_p16 (__a);
+}
-/* vld3q_lane */
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_laneq_s16 (int16x8_t __a, int16x8_t __b,
-+ int16x8_t __c, const int __lane)
++vmov_n_p64 (poly64_t __a)
+{
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
++ return vdup_n_p64 (__a);
+}
-#define __LD3_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
@@ -33725,19 +35573,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[1] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 1); \
- ret.val[2] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 2); \
- return ret; \
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_laneq_s32 (int32x4_t __a, int32x4_t __b,
-+ int32x4_t __c, const int __lane)
-+{
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+}
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_laneq_u16 (uint16x8_t __a, uint16x8_t __b,
-+ uint16x8_t __c, const int __lane)
++vmov_n_s8 (int8_t __a)
+{
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
++ return vdup_n_s8 (__a);
}
-__LD3_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16)
@@ -33753,23 +35593,27 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__LD3_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16)
-__LD3_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32)
-__LD3_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
-+ uint32x4_t __c, const int __lane)
++vmov_n_s16 (int16_t __a)
+{
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
++ return vdup_n_s16 (__a);
+}
-#undef __LD3_LANE_FUNC
-+/* vmov_n_ */
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmov_n_s32 (int32_t __a)
++{
++ return vdup_n_s32 (__a);
++}
-/* vld4_lane */
-+__extension__ extern __inline float16x4_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_f16 (float16_t __a)
++vmov_n_s64 (int64_t __a)
+{
-+ return vdup_n_f16 (__a);
++ return (int64x1_t) {__a};
+}
-#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \
@@ -33806,19 +35650,19 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2); \
- __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3); \
- return __b; \
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_f32 (float32_t __a)
++vmov_n_u8 (uint8_t __a)
+{
-+ return vdup_n_f32 (__a);
++ return vdup_n_u8 (__a);
}
-/* vld4q_lane */
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_f64 (float64_t __a)
++vmov_n_u16 (uint16_t __a)
+{
-+ return (float64x1_t) {__a};
++ return vdup_n_u16 (__a);
+}
-__LD4_LANE_FUNC (float16x4x4_t, float16x4_t, float16x8x4_t, float16_t, v4hf,
@@ -33847,48 +35691,55 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- u32, int32x4_t)
-__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, di, v2di, di,
- u64, int64x2_t)
-+__extension__ extern __inline poly8x8_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_p8 (poly8_t __a)
++vmov_n_u32 (uint32_t __a)
+{
-+ return vdup_n_p8 (__a);
++ return vdup_n_u32 (__a);
+}
+
-+__extension__ extern __inline poly16x4_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_p16 (poly16_t __a)
++vmov_n_u64 (uint64_t __a)
+{
-+ return vdup_n_p16 (__a);
++ return (uint64x1_t) {__a};
+}
+
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_s8 (int8_t __a)
++vmovq_n_f16 (float16_t __a)
+{
-+ return vdup_n_s8 (__a);
++ return vdupq_n_f16 (__a);
+}
+
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_s16 (int16_t __a)
++vmovq_n_f32 (float32_t __a)
+{
-+ return vdup_n_s16 (__a);
++ return vdupq_n_f32 (__a);
++}
++
++__extension__ extern __inline float64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmovq_n_f64 (float64_t __a)
++{
++ return vdupq_n_f64 (__a);
+}
-#undef __LD4_LANE_FUNC
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_s32 (int32_t __a)
++vmovq_n_p8 (poly8_t __a)
+{
-+ return vdup_n_s32 (__a);
++ return vdupq_n_p8 (__a);
+}
-/* vld4q_lane */
-+__extension__ extern __inline int64x1_t
++__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_s64 (int64_t __a)
++vmovq_n_p16 (poly16_t __a)
+{
-+ return (int64x1_t) {__a};
++ return vdupq_n_p16 (__a);
+}
-#define __LD4_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
@@ -33908,11 +35759,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- ret.val[2] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 2); \
- ret.val[3] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 3); \
- return ret; \
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_u8 (uint8_t __a)
++vmovq_n_p64 (poly64_t __a)
+{
-+ return vdup_n_u8 (__a);
++ return vdupq_n_p64 (__a);
}
-__LD4_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16)
@@ -33928,170 +35779,174 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__LD4_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16)
-__LD4_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32)
-__LD4_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_u16 (uint16_t __a)
++vmovq_n_s8 (int8_t __a)
+{
-+ return vdup_n_u16 (__a);
++ return vdupq_n_s8 (__a);
+}
-#undef __LD4_LANE_FUNC
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_u32 (uint32_t __a)
++vmovq_n_s16 (int16_t __a)
+{
-+ return vdup_n_u32 (__a);
++ return vdupq_n_s16 (__a);
+}
-/* vmax */
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_u64 (uint64_t __a)
++vmovq_n_s32 (int32_t __a)
+{
-+ return (uint64x1_t) {__a};
++ return vdupq_n_s32 (__a);
+}
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmax_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline float16x8_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_f16 (float16_t __a)
++vmovq_n_s64 (int64_t __a)
{
- return __builtin_aarch64_smax_nanv2sf (__a, __b);
-+ return vdupq_n_f16 (__a);
++ return vdupq_n_s64 (__a);
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vmax_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_f32 (float32_t __a)
++vmovq_n_u8 (uint8_t __a)
{
- return __builtin_aarch64_smaxv8qi (__a, __b);
-+ return vdupq_n_f32 (__a);
++ return vdupq_n_u8 (__a);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmax_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_f64 (float64_t __a)
++vmovq_n_u16 (uint16_t __a)
{
- return __builtin_aarch64_smaxv4hi (__a, __b);
-+ return vdupq_n_f64 (__a);
++ return vdupq_n_u16 (__a);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmax_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline poly8x16_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_p8 (poly8_t __a)
++vmovq_n_u32 (uint32_t __a)
{
- return __builtin_aarch64_smaxv2si (__a, __b);
-+ return vdupq_n_p8 (__a);
++ return vdupq_n_u32 (__a);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vmax_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline poly16x8_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_p16 (poly16_t __a)
++vmovq_n_u64 (uint64_t __a)
{
- return (uint8x8_t) __builtin_aarch64_umaxv8qi ((int8x8_t) __a,
- (int8x8_t) __b);
-+ return vdupq_n_p16 (__a);
++ return vdupq_n_u64 (__a);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmax_u16 (uint16x4_t __a, uint16x4_t __b)
-+__extension__ extern __inline int8x16_t
++/* vmul_lane */
++
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_s8 (int8_t __a)
++vmul_lane_f32 (float32x2_t __a, float32x2_t __b, const int __lane)
{
- return (uint16x4_t) __builtin_aarch64_umaxv4hi ((int16x4_t) __a,
- (int16x4_t) __b);
-+ return vdupq_n_s8 (__a);
++ return __a * __aarch64_vget_lane_any (__b, __lane);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmax_u32 (uint32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_s16 (int16_t __a)
++vmul_lane_f64 (float64x1_t __a, float64x1_t __b, const int __lane)
{
- return (uint32x2_t) __builtin_aarch64_umaxv2si ((int32x2_t) __a,
- (int32x2_t) __b);
-+ return vdupq_n_s16 (__a);
++ return __a * __b;
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmaxq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_s32 (int32_t __a)
++vmul_lane_s16 (int16x4_t __a, int16x4_t __b, const int __lane)
{
- return __builtin_aarch64_smax_nanv4sf (__a, __b);
-+ return vdupq_n_s32 (__a);
++ return __a * __aarch64_vget_lane_any (__b, __lane);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmaxq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_s64 (int64_t __a)
++vmul_lane_s32 (int32x2_t __a, int32x2_t __b, const int __lane)
{
- return __builtin_aarch64_smax_nanv2df (__a, __b);
-+ return vdupq_n_s64 (__a);
++ return __a * __aarch64_vget_lane_any (__b, __lane);
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vmaxq_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_u8 (uint8_t __a)
++vmul_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __lane)
{
- return __builtin_aarch64_smaxv16qi (__a, __b);
-+ return vdupq_n_u8 (__a);
++ return __a * __aarch64_vget_lane_any (__b, __lane);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmaxq_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_u16 (uint16_t __a)
++vmul_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __lane)
{
- return __builtin_aarch64_smaxv8hi (__a, __b);
-+ return vdupq_n_u16 (__a);
++ return __a * __aarch64_vget_lane_any (__b, __lane);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmaxq_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline uint32x4_t
++/* vmuld_lane */
++
++__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_u32 (uint32_t __a)
++vmuld_lane_f64 (float64_t __a, float64x1_t __b, const int __lane)
{
- return __builtin_aarch64_smaxv4si (__a, __b);
-+ return vdupq_n_u32 (__a);
++ return __a * __aarch64_vget_lane_any (__b, __lane);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vmaxq_u8 (uint8x16_t __a, uint8x16_t __b)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_u64 (uint64_t __a)
++vmuld_laneq_f64 (float64_t __a, float64x2_t __b, const int __lane)
{
- return (uint8x16_t) __builtin_aarch64_umaxv16qi ((int8x16_t) __a,
- (int8x16_t) __b);
-+ return vdupq_n_u64 (__a);
++ return __a * __aarch64_vget_lane_any (__b, __lane);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmaxq_u16 (uint16x8_t __a, uint16x8_t __b)
-+/* vmul_lane */
++/* vmuls_lane */
+
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_lane_f32 (float32x2_t __a, float32x2_t __b, const int __lane)
++vmuls_lane_f32 (float32_t __a, float32x2_t __b, const int __lane)
{
- return (uint16x8_t) __builtin_aarch64_umaxv8hi ((int16x8_t) __a,
- (int16x8_t) __b);
@@ -34100,21 +35955,23 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmaxq_u32 (uint32x4_t __a, uint32x4_t __b)
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_lane_f64 (float64x1_t __a, float64x1_t __b, const int __lane)
++vmuls_laneq_f32 (float32_t __a, float32x4_t __b, const int __lane)
{
- return (uint32x4_t) __builtin_aarch64_umaxv4si ((int32x4_t) __a,
- (int32x4_t) __b);
-+ return __a * __b;
++ return __a * __aarch64_vget_lane_any (__b, __lane);
}
-/* vmulx */
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmulx_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline int16x4_t
++/* vmul_laneq */
++
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_lane_s16 (int16x4_t __a, int16x4_t __b, const int __lane)
++vmul_laneq_f32 (float32x2_t __a, float32x4_t __b, const int __lane)
{
- return __builtin_aarch64_fmulxv2sf (__a, __b);
+ return __a * __aarch64_vget_lane_any (__b, __lane);
@@ -34122,9 +35979,9 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulxq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_lane_s32 (int32x2_t __a, int32x2_t __b, const int __lane)
++vmul_laneq_f64 (float64x1_t __a, float64x2_t __b, const int __lane)
{
- return __builtin_aarch64_fmulxv4sf (__a, __b);
+ return __a * __aarch64_vget_lane_any (__b, __lane);
@@ -34132,9 +35989,9 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vmulx_f64 (float64x1_t __a, float64x1_t __b)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __lane)
++vmul_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __lane)
{
- return (float64x1_t) {__builtin_aarch64_fmulxdf (__a[0], __b[0])};
+ return __a * __aarch64_vget_lane_any (__b, __lane);
@@ -34142,9 +35999,9 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulxq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __lane)
++vmul_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __lane)
{
- return __builtin_aarch64_fmulxv2df (__a, __b);
+ return __a * __aarch64_vget_lane_any (__b, __lane);
@@ -34152,11 +36009,9 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmulxs_f32 (float32_t __a, float32_t __b)
-+/* vmuld_lane */
-+
-+__extension__ extern __inline float64_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmuld_lane_f64 (float64_t __a, float64x1_t __b, const int __lane)
++vmul_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const int __lane)
{
- return __builtin_aarch64_fmulxsf (__a, __b);
+ return __a * __aarch64_vget_lane_any (__b, __lane);
@@ -34164,9 +36019,9 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vmulxd_f64 (float64_t __a, float64_t __b)
-+__extension__ extern __inline float64_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmuld_laneq_f64 (float64_t __a, float64x2_t __b, const int __lane)
++vmul_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const int __lane)
{
- return __builtin_aarch64_fmulxdf (__a, __b);
+ return __a * __aarch64_vget_lane_any (__b, __lane);
@@ -34174,21 +36029,23 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmulx_lane_f32 (float32x2_t __a, float32x2_t __v, const int __lane)
-+/* vmuls_lane */
++/* vmul_n */
+
-+__extension__ extern __inline float32_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmuls_lane_f32 (float32_t __a, float32x2_t __b, const int __lane)
++vmul_n_f64 (float64x1_t __a, float64_t __b)
{
- return vmulx_f32 (__a, __aarch64_vdup_lane_f32 (__v, __lane));
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
++ return (float64x1_t) { vget_lane_f64 (__a, 0) * __b };
}
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vmulx_lane_f64 (float64x1_t __a, float64x1_t __v, const int __lane)
-+__extension__ extern __inline float32_t
++/* vmulq_lane */
++
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmuls_laneq_f32 (float32_t __a, float32x4_t __b, const int __lane)
++vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __lane)
{
- return vmulx_f64 (__a, __aarch64_vdup_lane_f64 (__v, __lane));
+ return __a * __aarch64_vget_lane_any (__b, __lane);
@@ -34196,21 +36053,20 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulxq_lane_f32 (float32x4_t __a, float32x2_t __v, const int __lane)
-+/* vmul_laneq */
-+
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_laneq_f32 (float32x2_t __a, float32x4_t __b, const int __lane)
++vmulq_lane_f64 (float64x2_t __a, float64x1_t __b, const int __lane)
{
- return vmulxq_f32 (__a, __aarch64_vdupq_lane_f32 (__v, __lane));
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
++ __AARCH64_LANE_CHECK (__a, __lane);
++ return __a * __b[0];
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulxq_lane_f64 (float64x2_t __a, float64x1_t __v, const int __lane)
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_laneq_f64 (float64x1_t __a, float64x2_t __b, const int __lane)
++vmulq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __lane)
{
- return vmulxq_f64 (__a, __aarch64_vdupq_lane_f64 (__v, __lane));
+ return __a * __aarch64_vget_lane_any (__b, __lane);
@@ -34218,9 +36074,9 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmulx_laneq_f32 (float32x2_t __a, float32x4_t __v, const int __lane)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __lane)
++vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __lane)
{
- return vmulx_f32 (__a, __aarch64_vdup_laneq_f32 (__v, __lane));
+ return __a * __aarch64_vget_lane_any (__b, __lane);
@@ -34228,9 +36084,9 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vmulx_laneq_f64 (float64x1_t __a, float64x2_t __v, const int __lane)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __lane)
++vmulq_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __lane)
{
- return vmulx_f64 (__a, __aarch64_vdup_laneq_f64 (__v, __lane));
+ return __a * __aarch64_vget_lane_any (__b, __lane);
@@ -34238,9 +36094,9 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulxq_laneq_f32 (float32x4_t __a, float32x4_t __v, const int __lane)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const int __lane)
++vmulq_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __lane)
{
- return vmulxq_f32 (__a, __aarch64_vdupq_laneq_f32 (__v, __lane));
+ return __a * __aarch64_vget_lane_any (__b, __lane);
@@ -34248,9 +36104,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulxq_laneq_f64 (float64x2_t __a, float64x2_t __v, const int __lane)
-+__extension__ extern __inline uint32x2_t
++/* vmulq_laneq */
++
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const int __lane)
++vmulq_laneq_f32 (float32x4_t __a, float32x4_t __b, const int __lane)
{
- return vmulxq_f64 (__a, __aarch64_vdupq_laneq_f64 (__v, __lane));
+ return __a * __aarch64_vget_lane_any (__b, __lane);
@@ -34258,23 +36116,19 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmulxs_lane_f32 (float32_t __a, float32x2_t __v, const int __lane)
-+/* vmul_n */
-+
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_n_f64 (float64x1_t __a, float64_t __b)
++vmulq_laneq_f64 (float64x2_t __a, float64x2_t __b, const int __lane)
{
- return vmulxs_f32 (__a, __aarch64_vget_lane_any (__v, __lane));
-+ return (float64x1_t) { vget_lane_f64 (__a, 0) * __b };
++ return __a * __aarch64_vget_lane_any (__b, __lane);
}
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmulxs_laneq_f32 (float32_t __a, float32x4_t __v, const int __lane)
-+/* vmulq_lane */
-+
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __lane)
++vmulq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __lane)
{
- return vmulxs_f32 (__a, __aarch64_vget_lane_any (__v, __lane));
+ return __a * __aarch64_vget_lane_any (__b, __lane);
@@ -34282,245 +36136,250 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vmulxd_lane_f64 (float64_t __a, float64x1_t __v, const int __lane)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_lane_f64 (float64x2_t __a, float64x1_t __b, const int __lane)
++vmulq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __lane)
{
- return vmulxd_f64 (__a, __aarch64_vget_lane_any (__v, __lane));
-+ __AARCH64_LANE_CHECK (__a, __lane);
-+ return __a * __b[0];
++ return __a * __aarch64_vget_lane_any (__b, __lane);
}
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vmulxd_laneq_f64 (float64_t __a, float64x2_t __v, const int __lane)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __lane)
++vmulq_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __lane)
{
- return vmulxd_f64 (__a, __aarch64_vget_lane_any (__v, __lane));
+ return __a * __aarch64_vget_lane_any (__b, __lane);
}
-/* vpmax */
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __lane)
++vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __lane)
+{
+ return __a * __aarch64_vget_lane_any (__b, __lane);
+}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vpmax_s8 (int8x8_t a, int8x8_t b)
-+__extension__ extern __inline uint16x8_t
++/* vmul_n. */
++
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __lane)
++vmul_n_f32 (float32x2_t __a, float32_t __b)
{
- return __builtin_aarch64_smaxpv8qi (a, b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
++ return __a * __b;
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vpmax_s16 (int16x4_t a, int16x4_t b)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __lane)
++vmulq_n_f32 (float32x4_t __a, float32_t __b)
{
- return __builtin_aarch64_smaxpv4hi (a, b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
++ return __a * __b;
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vpmax_s32 (int32x2_t a, int32x2_t b)
-+/* vmulq_laneq */
-+
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_laneq_f32 (float32x4_t __a, float32x4_t __b, const int __lane)
++vmulq_n_f64 (float64x2_t __a, float64_t __b)
{
- return __builtin_aarch64_smaxpv2si (a, b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
++ return __a * __b;
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vpmax_u8 (uint8x8_t a, uint8x8_t b)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_laneq_f64 (float64x2_t __a, float64x2_t __b, const int __lane)
++vmul_n_s16 (int16x4_t __a, int16_t __b)
{
- return (uint8x8_t) __builtin_aarch64_umaxpv8qi ((int8x8_t) a,
- (int8x8_t) b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
++ return __a * __b;
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vpmax_u16 (uint16x4_t a, uint16x4_t b)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __lane)
++vmulq_n_s16 (int16x8_t __a, int16_t __b)
{
- return (uint16x4_t) __builtin_aarch64_umaxpv4hi ((int16x4_t) a,
- (int16x4_t) b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
++ return __a * __b;
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vpmax_u32 (uint32x2_t a, uint32x2_t b)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __lane)
++vmul_n_s32 (int32x2_t __a, int32_t __b)
{
- return (uint32x2_t) __builtin_aarch64_umaxpv2si ((int32x2_t) a,
- (int32x2_t) b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
++ return __a * __b;
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vpmaxq_s8 (int8x16_t a, int8x16_t b)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __lane)
++vmulq_n_s32 (int32x4_t __a, int32_t __b)
{
- return __builtin_aarch64_smaxpv16qi (a, b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
++ return __a * __b;
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vpmaxq_s16 (int16x8_t a, int16x8_t b)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __lane)
++vmul_n_u16 (uint16x4_t __a, uint16_t __b)
{
- return __builtin_aarch64_smaxpv8hi (a, b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
++ return __a * __b;
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vpmaxq_s32 (int32x4_t a, int32x4_t b)
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
++{
++ return __a * __b;
+}
+
-+/* vmul_n. */
-+
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_n_f32 (float32x2_t __a, float32_t __b)
++vmul_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+ return __a * __b;
+}
+
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_n_f32 (float32x4_t __a, float32_t __b)
++vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
+{
+ return __a * __b;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vpmaxq_s32 (int32x4_t a, int32x4_t b)
-+__extension__ extern __inline float64x2_t
++}
++
++/* vmvn */
++
++__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_n_f64 (float64x2_t __a, float64_t __b)
++vmvn_p8 (poly8x8_t __a)
{
- return __builtin_aarch64_smaxpv4si (a, b);
-+ return __a * __b;
++ return (poly8x8_t) ~((int8x8_t) __a);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vpmaxq_u8 (uint8x16_t a, uint8x16_t b)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_n_s16 (int16x4_t __a, int16_t __b)
++vmvn_s8 (int8x8_t __a)
{
- return (uint8x16_t) __builtin_aarch64_umaxpv16qi ((int8x16_t) a,
- (int8x16_t) b);
-+ return __a * __b;
++ return ~__a;
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vpmaxq_u16 (uint16x8_t a, uint16x8_t b)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_n_s16 (int16x8_t __a, int16_t __b)
++vmvn_s16 (int16x4_t __a)
{
- return (uint16x8_t) __builtin_aarch64_umaxpv8hi ((int16x8_t) a,
- (int16x8_t) b);
-+ return __a * __b;
++ return ~__a;
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vpmaxq_u32 (uint32x4_t a, uint32x4_t b)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_n_s32 (int32x2_t __a, int32_t __b)
++vmvn_s32 (int32x2_t __a)
{
- return (uint32x4_t) __builtin_aarch64_umaxpv4si ((int32x4_t) a,
- (int32x4_t) b);
-+ return __a * __b;
++ return ~__a;
}
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vpmax_f32 (float32x2_t a, float32x2_t b)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_n_s32 (int32x4_t __a, int32_t __b)
++vmvn_u8 (uint8x8_t __a)
{
- return __builtin_aarch64_smax_nanpv2sf (a, b);
-+ return __a * __b;
++ return ~__a;
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vpmaxq_f32 (float32x4_t a, float32x4_t b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_n_u16 (uint16x4_t __a, uint16_t __b)
++vmvn_u16 (uint16x4_t __a)
{
- return __builtin_aarch64_smax_nanpv4sf (a, b);
-+ return __a * __b;
++ return ~__a;
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vpmaxq_f64 (float64x2_t a, float64x2_t b)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
++vmvn_u32 (uint32x2_t __a)
{
- return __builtin_aarch64_smax_nanpv2df (a, b);
-+ return __a * __b;
++ return ~__a;
}
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vpmaxqd_f64 (float64x2_t a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_n_u32 (uint32x2_t __a, uint32_t __b)
++vmvnq_p8 (poly8x16_t __a)
{
- return __builtin_aarch64_reduc_smax_nan_scal_v2df (a);
-+ return __a * __b;
++ return (poly8x16_t) ~((int8x16_t) __a);
}
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vpmaxs_f32 (float32x2_t a)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
++vmvnq_s8 (int8x16_t __a)
{
- return __builtin_aarch64_reduc_smax_nan_scal_v2sf (a);
-+ return __a * __b;
++ return ~__a;
}
-/* vpmaxnm */
-+/* vmvn */
-
+-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vpmaxnm_f32 (float32x2_t a, float32x2_t b)
-+__extension__ extern __inline poly8x8_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvn_p8 (poly8x8_t __a)
++vmvnq_s16 (int16x8_t __a)
{
- return __builtin_aarch64_smaxpv2sf (a, b);
-+ return (poly8x8_t) ~((int8x8_t) __a);
++ return ~__a;
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vpmaxnmq_f32 (float32x4_t a, float32x4_t b)
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvn_s8 (int8x8_t __a)
++vmvnq_s32 (int32x4_t __a)
{
- return __builtin_aarch64_smaxpv4sf (a, b);
+ return ~__a;
@@ -34528,9 +36387,9 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vpmaxnmq_f64 (float64x2_t a, float64x2_t b)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvn_s16 (int16x4_t __a)
++vmvnq_u8 (uint8x16_t __a)
{
- return __builtin_aarch64_smaxpv2df (a, b);
+ return ~__a;
@@ -34538,9 +36397,9 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vpmaxnmqd_f64 (float64x2_t a)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvn_s32 (int32x2_t __a)
++vmvnq_u16 (uint16x8_t __a)
{
- return __builtin_aarch64_reduc_smax_scal_v2df (a);
+ return ~__a;
@@ -34548,122 +36407,126 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vpmaxnms_f32 (float32x2_t a)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvn_u8 (uint8x8_t __a)
++vmvnq_u32 (uint32x4_t __a)
{
- return __builtin_aarch64_reduc_smax_scal_v2sf (a);
+ return ~__a;
}
-/* vpmin */
--
++/* vneg */
+
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vpmin_s8 (int8x8_t a, int8x8_t b)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvn_u16 (uint16x4_t __a)
++vneg_f32 (float32x2_t __a)
{
- return __builtin_aarch64_sminpv8qi (a, b);
-+ return ~__a;
++ return -__a;
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vpmin_s16 (int16x4_t a, int16x4_t b)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvn_u32 (uint32x2_t __a)
++vneg_f64 (float64x1_t __a)
{
- return __builtin_aarch64_sminpv4hi (a, b);
-+ return ~__a;
++ return -__a;
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vpmin_s32 (int32x2_t a, int32x2_t b)
-+__extension__ extern __inline poly8x16_t
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvnq_p8 (poly8x16_t __a)
++vneg_s8 (int8x8_t __a)
{
- return __builtin_aarch64_sminpv2si (a, b);
-+ return (poly8x16_t) ~((int8x16_t) __a);
++ return -__a;
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vpmin_u8 (uint8x8_t a, uint8x8_t b)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvnq_s8 (int8x16_t __a)
++vneg_s16 (int16x4_t __a)
{
- return (uint8x8_t) __builtin_aarch64_uminpv8qi ((int8x8_t) a,
- (int8x8_t) b);
-+ return ~__a;
++ return -__a;
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vpmin_u16 (uint16x4_t a, uint16x4_t b)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvnq_s16 (int16x8_t __a)
++vneg_s32 (int32x2_t __a)
{
- return (uint16x4_t) __builtin_aarch64_uminpv4hi ((int16x4_t) a,
- (int16x4_t) b);
-+ return ~__a;
++ return -__a;
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vpmin_u32 (uint32x2_t a, uint32x2_t b)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvnq_s32 (int32x4_t __a)
++vneg_s64 (int64x1_t __a)
{
- return (uint32x2_t) __builtin_aarch64_uminpv2si ((int32x2_t) a,
- (int32x2_t) b);
-+ return ~__a;
++ return -__a;
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vpminq_s8 (int8x16_t a, int8x16_t b)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvnq_u8 (uint8x16_t __a)
++vnegq_f32 (float32x4_t __a)
{
- return __builtin_aarch64_sminpv16qi (a, b);
-+ return ~__a;
++ return -__a;
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vpminq_s16 (int16x8_t a, int16x8_t b)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvnq_u16 (uint16x8_t __a)
++vnegq_f64 (float64x2_t __a)
{
- return __builtin_aarch64_sminpv8hi (a, b);
-+ return ~__a;
++ return -__a;
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vpminq_s32 (int32x4_t a, int32x4_t b)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvnq_u32 (uint32x4_t __a)
++vnegq_s8 (int8x16_t __a)
{
- return __builtin_aarch64_sminpv4si (a, b);
-+ return ~__a;
++ return -__a;
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vpminq_u8 (uint8x16_t a, uint8x16_t b)
--{
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vnegq_s16 (int16x8_t __a)
+ {
- return (uint8x16_t) __builtin_aarch64_uminpv16qi ((int8x16_t) a,
- (int8x16_t) b);
--}
-+/* vneg */
++ return -__a;
+ }
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vpminq_u16 (uint16x8_t a, uint16x8_t b)
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vneg_f32 (float32x2_t __a)
++vnegq_s32 (int32x4_t __a)
{
- return (uint16x8_t) __builtin_aarch64_uminpv8hi ((int16x8_t) a,
- (int16x8_t) b);
@@ -34672,9 +36535,9 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vpminq_u32 (uint32x4_t a, uint32x4_t b)
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vneg_f64 (float64x1_t __a)
++vnegq_s64 (int64x2_t __a)
{
- return (uint32x4_t) __builtin_aarch64_uminpv4si ((int32x4_t) a,
- (int32x4_t) b);
@@ -34683,820 +36546,827 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vpmin_f32 (float32x2_t a, float32x2_t b)
-+__extension__ extern __inline int8x8_t
++/* vpadd */
++
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vneg_s8 (int8x8_t __a)
++vpadd_f32 (float32x2_t __a, float32x2_t __b)
{
- return __builtin_aarch64_smin_nanpv2sf (a, b);
-+ return -__a;
++ return __builtin_aarch64_faddpv2sf (__a, __b);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vpminq_f32 (float32x4_t a, float32x4_t b)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vneg_s16 (int16x4_t __a)
++vpaddq_f32 (float32x4_t __a, float32x4_t __b)
{
- return __builtin_aarch64_smin_nanpv4sf (a, b);
-+ return -__a;
++ return __builtin_aarch64_faddpv4sf (__a, __b);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vpminq_f64 (float64x2_t a, float64x2_t b)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vneg_s32 (int32x2_t __a)
++vpaddq_f64 (float64x2_t __a, float64x2_t __b)
{
- return __builtin_aarch64_smin_nanpv2df (a, b);
-+ return -__a;
++ return __builtin_aarch64_faddpv2df (__a, __b);
}
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vpminqd_f64 (float64x2_t a)
-+__extension__ extern __inline int64x1_t
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vneg_s64 (int64x1_t __a)
++vpadd_s8 (int8x8_t __a, int8x8_t __b)
{
- return __builtin_aarch64_reduc_smin_nan_scal_v2df (a);
-+ return -__a;
++ return __builtin_aarch64_addpv8qi (__a, __b);
}
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vpmins_f32 (float32x2_t a)
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vnegq_f32 (float32x4_t __a)
++vpadd_s16 (int16x4_t __a, int16x4_t __b)
{
- return __builtin_aarch64_reduc_smin_nan_scal_v2sf (a);
-+ return -__a;
++ return __builtin_aarch64_addpv4hi (__a, __b);
}
-/* vpminnm */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vpminnm_f32 (float32x2_t a, float32x2_t b)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vnegq_f64 (float64x2_t __a)
++vpadd_s32 (int32x2_t __a, int32x2_t __b)
{
- return __builtin_aarch64_sminpv2sf (a, b);
-+ return -__a;
++ return __builtin_aarch64_addpv2si (__a, __b);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vpminnmq_f32 (float32x4_t a, float32x4_t b)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vnegq_s8 (int8x16_t __a)
++vpadd_u8 (uint8x8_t __a, uint8x8_t __b)
{
- return __builtin_aarch64_sminpv4sf (a, b);
-+ return -__a;
++ return (uint8x8_t) __builtin_aarch64_addpv8qi ((int8x8_t) __a,
++ (int8x8_t) __b);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vpminnmq_f64 (float64x2_t a, float64x2_t b)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vnegq_s16 (int16x8_t __a)
++vpadd_u16 (uint16x4_t __a, uint16x4_t __b)
{
- return __builtin_aarch64_sminpv2df (a, b);
-+ return -__a;
++ return (uint16x4_t) __builtin_aarch64_addpv4hi ((int16x4_t) __a,
++ (int16x4_t) __b);
}
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vpminnmqd_f64 (float64x2_t a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vnegq_s32 (int32x4_t __a)
++vpadd_u32 (uint32x2_t __a, uint32x2_t __b)
{
- return __builtin_aarch64_reduc_smin_scal_v2df (a);
-+ return -__a;
++ return (uint32x2_t) __builtin_aarch64_addpv2si ((int32x2_t) __a,
++ (int32x2_t) __b);
}
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vpminnms_f32 (float32x2_t a)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vnegq_s64 (int64x2_t __a)
++vpadds_f32 (float32x2_t __a)
{
- return __builtin_aarch64_reduc_smin_scal_v2sf (a);
-+ return -__a;
++ return __builtin_aarch64_reduc_plus_scal_v2sf (__a);
}
-/* vmaxnm */
-+/* vpadd */
-
+-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmaxnm_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadd_f32 (float32x2_t __a, float32x2_t __b)
++vpaddd_f64 (float64x2_t __a)
{
- return __builtin_aarch64_fmaxv2sf (__a, __b);
-+ return __builtin_aarch64_faddpv2sf (__a, __b);
++ return __builtin_aarch64_reduc_plus_scal_v2df (__a);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmaxnmq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddq_f32 (float32x4_t __a, float32x4_t __b)
++vpaddd_s64 (int64x2_t __a)
{
- return __builtin_aarch64_fmaxv4sf (__a, __b);
-+ return __builtin_aarch64_faddpv4sf (__a, __b);
++ return __builtin_aarch64_addpdi (__a);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmaxnmq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddq_f64 (float64x2_t __a, float64x2_t __b)
++vpaddd_u64 (uint64x2_t __a)
{
- return __builtin_aarch64_fmaxv2df (__a, __b);
-+ return __builtin_aarch64_faddpv2df (__a, __b);
++ return __builtin_aarch64_addpdi ((int64x2_t) __a);
}
-/* vmaxv */
--
++/* vqabs */
+
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmaxv_f32 (float32x2_t __a)
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadd_s8 (int8x8_t __a, int8x8_t __b)
++vqabsq_s64 (int64x2_t __a)
{
- return __builtin_aarch64_reduc_smax_nan_scal_v2sf (__a);
-+ return __builtin_aarch64_addpv8qi (__a, __b);
++ return (int64x2_t) __builtin_aarch64_sqabsv2di (__a);
}
-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
-vmaxv_s8 (int8x8_t __a)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadd_s16 (int16x4_t __a, int16x4_t __b)
++vqabsb_s8 (int8_t __a)
{
- return __builtin_aarch64_reduc_smax_scal_v8qi (__a);
-+ return __builtin_aarch64_addpv4hi (__a, __b);
++ return (int8_t) __builtin_aarch64_sqabsqi (__a);
}
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vmaxv_s16 (int16x4_t __a)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadd_s32 (int32x2_t __a, int32x2_t __b)
++vqabsh_s16 (int16_t __a)
{
- return __builtin_aarch64_reduc_smax_scal_v4hi (__a);
-+ return __builtin_aarch64_addpv2si (__a, __b);
++ return (int16_t) __builtin_aarch64_sqabshi (__a);
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vmaxv_s32 (int32x2_t __a)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadd_u8 (uint8x8_t __a, uint8x8_t __b)
++vqabss_s32 (int32_t __a)
{
- return __builtin_aarch64_reduc_smax_scal_v2si (__a);
-+ return (uint8x8_t) __builtin_aarch64_addpv8qi ((int8x8_t) __a,
-+ (int8x8_t) __b);
++ return (int32_t) __builtin_aarch64_sqabssi (__a);
}
-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
-vmaxv_u8 (uint8x8_t __a)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadd_u16 (uint16x4_t __a, uint16x4_t __b)
++vqabsd_s64 (int64_t __a)
{
- return __builtin_aarch64_reduc_umax_scal_v8qi_uu (__a);
-+ return (uint16x4_t) __builtin_aarch64_addpv4hi ((int16x4_t) __a,
-+ (int16x4_t) __b);
++ return __builtin_aarch64_sqabsdi (__a);
}
-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-vmaxv_u16 (uint16x4_t __a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadd_u32 (uint32x2_t __a, uint32x2_t __b)
- {
+-{
- return __builtin_aarch64_reduc_umax_scal_v4hi_uu (__a);
-+ return (uint32x2_t) __builtin_aarch64_addpv2si ((int32x2_t) __a,
-+ (int32x2_t) __b);
- }
+-}
++/* vqadd */
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vmaxv_u32 (uint32x2_t __a)
-+__extension__ extern __inline float32_t
++__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadds_f32 (float32x2_t __a)
++vqaddb_s8 (int8_t __a, int8_t __b)
{
- return __builtin_aarch64_reduc_umax_scal_v2si_uu (__a);
-+ return __builtin_aarch64_reduc_plus_scal_v2sf (__a);
++ return (int8_t) __builtin_aarch64_sqaddqi (__a, __b);
}
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmaxvq_f32 (float32x4_t __a)
-+__extension__ extern __inline float64_t
++__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddd_f64 (float64x2_t __a)
++vqaddh_s16 (int16_t __a, int16_t __b)
{
- return __builtin_aarch64_reduc_smax_nan_scal_v4sf (__a);
-+ return __builtin_aarch64_reduc_plus_scal_v2df (__a);
++ return (int16_t) __builtin_aarch64_sqaddhi (__a, __b);
}
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vmaxvq_f64 (float64x2_t __a)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddd_s64 (int64x2_t __a)
++vqadds_s32 (int32_t __a, int32_t __b)
{
- return __builtin_aarch64_reduc_smax_nan_scal_v2df (__a);
-+ return __builtin_aarch64_addpdi (__a);
++ return (int32_t) __builtin_aarch64_sqaddsi (__a, __b);
}
-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
-vmaxvq_s8 (int8x16_t __a)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddd_u64 (uint64x2_t __a)
++vqaddd_s64 (int64_t __a, int64_t __b)
{
- return __builtin_aarch64_reduc_smax_scal_v16qi (__a);
-+ return __builtin_aarch64_addpdi ((int64x2_t) __a);
++ return __builtin_aarch64_sqadddi (__a, __b);
}
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vmaxvq_s16 (int16x8_t __a)
-+/* vqabs */
-+
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqabsq_s64 (int64x2_t __a)
++vqaddb_u8 (uint8_t __a, uint8_t __b)
{
- return __builtin_aarch64_reduc_smax_scal_v8hi (__a);
-+ return (int64x2_t) __builtin_aarch64_sqabsv2di (__a);
++ return (uint8_t) __builtin_aarch64_uqaddqi_uuu (__a, __b);
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vmaxvq_s32 (int32x4_t __a)
-+__extension__ extern __inline int8_t
++__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqabsb_s8 (int8_t __a)
++vqaddh_u16 (uint16_t __a, uint16_t __b)
{
- return __builtin_aarch64_reduc_smax_scal_v4si (__a);
-+ return (int8_t) __builtin_aarch64_sqabsqi (__a);
++ return (uint16_t) __builtin_aarch64_uqaddhi_uuu (__a, __b);
}
-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
-vmaxvq_u8 (uint8x16_t __a)
-+__extension__ extern __inline int16_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqabsh_s16 (int16_t __a)
++vqadds_u32 (uint32_t __a, uint32_t __b)
{
- return __builtin_aarch64_reduc_umax_scal_v16qi_uu (__a);
-+ return (int16_t) __builtin_aarch64_sqabshi (__a);
++ return (uint32_t) __builtin_aarch64_uqaddsi_uuu (__a, __b);
}
-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-vmaxvq_u16 (uint16x8_t __a)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqabss_s32 (int32_t __a)
++vqaddd_u64 (uint64_t __a, uint64_t __b)
{
- return __builtin_aarch64_reduc_umax_scal_v8hi_uu (__a);
-+ return (int32_t) __builtin_aarch64_sqabssi (__a);
++ return __builtin_aarch64_uqadddi_uuu (__a, __b);
}
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vmaxvq_u32 (uint32x4_t __a)
-+__extension__ extern __inline int64_t
++/* vqdmlal */
++
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqabsd_s64 (int64_t __a)
++vqdmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
{
- return __builtin_aarch64_reduc_umax_scal_v4si_uu (__a);
-+ return __builtin_aarch64_sqabsdi (__a);
++ return __builtin_aarch64_sqdmlalv4hi (__a, __b, __c);
}
-/* vmaxnmv */
-+/* vqadd */
-
+-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmaxnmv_f32 (float32x2_t __a)
-+__extension__ extern __inline int8_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqaddb_s8 (int8_t __a, int8_t __b)
++vqdmlal_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c)
{
- return __builtin_aarch64_reduc_smax_scal_v2sf (__a);
-+ return (int8_t) __builtin_aarch64_sqaddqi (__a, __b);
++ return __builtin_aarch64_sqdmlal2v8hi (__a, __b, __c);
}
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmaxnmvq_f32 (float32x4_t __a)
-+__extension__ extern __inline int16_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqaddh_s16 (int16_t __a, int16_t __b)
++vqdmlal_high_lane_s16 (int32x4_t __a, int16x8_t __b, int16x4_t __c,
++ int const __d)
{
- return __builtin_aarch64_reduc_smax_scal_v4sf (__a);
-+ return (int16_t) __builtin_aarch64_sqaddhi (__a, __b);
++ return __builtin_aarch64_sqdmlal2_lanev8hi (__a, __b, __c, __d);
}
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vmaxnmvq_f64 (float64x2_t __a)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqadds_s32 (int32_t __a, int32_t __b)
++vqdmlal_high_laneq_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c,
++ int const __d)
{
- return __builtin_aarch64_reduc_smax_scal_v2df (__a);
-+ return (int32_t) __builtin_aarch64_sqaddsi (__a, __b);
++ return __builtin_aarch64_sqdmlal2_laneqv8hi (__a, __b, __c, __d);
}
-/* vmin */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmin_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqaddd_s64 (int64_t __a, int64_t __b)
++vqdmlal_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c)
{
- return __builtin_aarch64_smin_nanv2sf (__a, __b);
-+ return __builtin_aarch64_sqadddi (__a, __b);
++ return __builtin_aarch64_sqdmlal2_nv8hi (__a, __b, __c);
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vmin_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline uint8_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqaddb_u8 (uint8_t __a, uint8_t __b)
++vqdmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d)
{
- return __builtin_aarch64_sminv8qi (__a, __b);
-+ return (uint8_t) __builtin_aarch64_uqaddqi_uuu (__a, __b);
++ return __builtin_aarch64_sqdmlal_lanev4hi (__a, __b, __c, __d);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmin_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline uint16_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqaddh_u16 (uint16_t __a, uint16_t __b)
++vqdmlal_laneq_s16 (int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d)
{
- return __builtin_aarch64_sminv4hi (__a, __b);
-+ return (uint16_t) __builtin_aarch64_uqaddhi_uuu (__a, __b);
++ return __builtin_aarch64_sqdmlal_laneqv4hi (__a, __b, __c, __d);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmin_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqadds_u32 (uint32_t __a, uint32_t __b)
++vqdmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
{
- return __builtin_aarch64_sminv2si (__a, __b);
-+ return (uint32_t) __builtin_aarch64_uqaddsi_uuu (__a, __b);
++ return __builtin_aarch64_sqdmlal_nv4hi (__a, __b, __c);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vmin_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqaddd_u64 (uint64_t __a, uint64_t __b)
++vqdmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
{
- return (uint8x8_t) __builtin_aarch64_uminv8qi ((int8x8_t) __a,
- (int8x8_t) __b);
-+ return __builtin_aarch64_uqadddi_uuu (__a, __b);
++ return __builtin_aarch64_sqdmlalv2si (__a, __b, __c);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmin_u16 (uint16x4_t __a, uint16x4_t __b)
--{
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vqdmlal_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c)
+ {
- return (uint16x4_t) __builtin_aarch64_uminv4hi ((int16x4_t) __a,
- (int16x4_t) __b);
--}
-+/* vqdmlal */
++ return __builtin_aarch64_sqdmlal2v4si (__a, __b, __c);
+ }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmin_u32 (uint32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
++vqdmlal_high_lane_s32 (int64x2_t __a, int32x4_t __b, int32x2_t __c,
++ int const __d)
{
- return (uint32x2_t) __builtin_aarch64_uminv2si ((int32x2_t) __a,
- (int32x2_t) __b);
-+ return __builtin_aarch64_sqdmlalv4hi (__a, __b, __c);
++ return __builtin_aarch64_sqdmlal2_lanev4si (__a, __b, __c, __d);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vminq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c)
++vqdmlal_high_laneq_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c,
++ int const __d)
{
- return __builtin_aarch64_smin_nanv4sf (__a, __b);
-+ return __builtin_aarch64_sqdmlal2v8hi (__a, __b, __c);
++ return __builtin_aarch64_sqdmlal2_laneqv4si (__a, __b, __c, __d);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vminq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_high_lane_s16 (int32x4_t __a, int16x8_t __b, int16x4_t __c,
-+ int const __d)
++vqdmlal_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c)
{
- return __builtin_aarch64_smin_nanv2df (__a, __b);
-+ return __builtin_aarch64_sqdmlal2_lanev8hi (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmlal2_nv4si (__a, __b, __c);
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vminq_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_high_laneq_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c,
-+ int const __d)
++vqdmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d)
{
- return __builtin_aarch64_sminv16qi (__a, __b);
-+ return __builtin_aarch64_sqdmlal2_laneqv8hi (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmlal_lanev2si (__a, __b, __c, __d);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vminq_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c)
++vqdmlal_laneq_s32 (int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d)
{
- return __builtin_aarch64_sminv8hi (__a, __b);
-+ return __builtin_aarch64_sqdmlal2_nv8hi (__a, __b, __c);
++ return __builtin_aarch64_sqdmlal_laneqv2si (__a, __b, __c, __d);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vminq_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d)
++vqdmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
{
- return __builtin_aarch64_sminv4si (__a, __b);
-+ return __builtin_aarch64_sqdmlal_lanev4hi (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmlal_nv2si (__a, __b, __c);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vminq_u8 (uint8x16_t __a, uint8x16_t __b)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_laneq_s16 (int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d)
++vqdmlalh_s16 (int32_t __a, int16_t __b, int16_t __c)
{
- return (uint8x16_t) __builtin_aarch64_uminv16qi ((int8x16_t) __a,
- (int8x16_t) __b);
-+ return __builtin_aarch64_sqdmlal_laneqv4hi (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmlalhi (__a, __b, __c);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vminq_u16 (uint16x8_t __a, uint16x8_t __b)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
++vqdmlalh_lane_s16 (int32_t __a, int16_t __b, int16x4_t __c, const int __d)
{
- return (uint16x8_t) __builtin_aarch64_uminv8hi ((int16x8_t) __a,
- (int16x8_t) __b);
-+ return __builtin_aarch64_sqdmlal_nv4hi (__a, __b, __c);
++ return __builtin_aarch64_sqdmlal_lanehi (__a, __b, __c, __d);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vminq_u32 (uint32x4_t __a, uint32x4_t __b)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
++vqdmlalh_laneq_s16 (int32_t __a, int16_t __b, int16x8_t __c, const int __d)
{
- return (uint32x4_t) __builtin_aarch64_uminv4si ((int32x4_t) __a,
- (int32x4_t) __b);
-+ return __builtin_aarch64_sqdmlalv2si (__a, __b, __c);
++ return __builtin_aarch64_sqdmlal_laneqhi (__a, __b, __c, __d);
}
-/* vminnm */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vminnm_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c)
++vqdmlals_s32 (int64_t __a, int32_t __b, int32_t __c)
{
- return __builtin_aarch64_fminv2sf (__a, __b);
-+ return __builtin_aarch64_sqdmlal2v4si (__a, __b, __c);
++ return __builtin_aarch64_sqdmlalsi (__a, __b, __c);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vminnmq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_high_lane_s32 (int64x2_t __a, int32x4_t __b, int32x2_t __c,
-+ int const __d)
++vqdmlals_lane_s32 (int64_t __a, int32_t __b, int32x2_t __c, const int __d)
{
- return __builtin_aarch64_fminv4sf (__a, __b);
-+ return __builtin_aarch64_sqdmlal2_lanev4si (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmlal_lanesi (__a, __b, __c, __d);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vminnmq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_high_laneq_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c,
-+ int const __d)
++vqdmlals_laneq_s32 (int64_t __a, int32_t __b, int32x4_t __c, const int __d)
{
- return __builtin_aarch64_fminv2df (__a, __b);
-+ return __builtin_aarch64_sqdmlal2_laneqv4si (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmlal_laneqsi (__a, __b, __c, __d);
}
-/* vminv */
--
++/* vqdmlsl */
+
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vminv_f32 (float32x2_t __a)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c)
++vqdmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
{
- return __builtin_aarch64_reduc_smin_nan_scal_v2sf (__a);
-+ return __builtin_aarch64_sqdmlal2_nv4si (__a, __b, __c);
++ return __builtin_aarch64_sqdmlslv4hi (__a, __b, __c);
}
-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
-vminv_s8 (int8x8_t __a)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d)
++vqdmlsl_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c)
{
- return __builtin_aarch64_reduc_smin_scal_v8qi (__a);
-+ return __builtin_aarch64_sqdmlal_lanev2si (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmlsl2v8hi (__a, __b, __c);
}
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vminv_s16 (int16x4_t __a)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_laneq_s32 (int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d)
++vqdmlsl_high_lane_s16 (int32x4_t __a, int16x8_t __b, int16x4_t __c,
++ int const __d)
{
- return __builtin_aarch64_reduc_smin_scal_v4hi (__a);
-+ return __builtin_aarch64_sqdmlal_laneqv2si (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmlsl2_lanev8hi (__a, __b, __c, __d);
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vminv_s32 (int32x2_t __a)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
++vqdmlsl_high_laneq_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c,
++ int const __d)
{
- return __builtin_aarch64_reduc_smin_scal_v2si (__a);
-+ return __builtin_aarch64_sqdmlal_nv2si (__a, __b, __c);
++ return __builtin_aarch64_sqdmlsl2_laneqv8hi (__a, __b, __c, __d);
}
-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
-vminv_u8 (uint8x8_t __a)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlalh_s16 (int32_t __a, int16_t __b, int16_t __c)
++vqdmlsl_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c)
{
- return __builtin_aarch64_reduc_umin_scal_v8qi_uu (__a);
-+ return __builtin_aarch64_sqdmlalhi (__a, __b, __c);
++ return __builtin_aarch64_sqdmlsl2_nv8hi (__a, __b, __c);
}
-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-vminv_u16 (uint16x4_t __a)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlalh_lane_s16 (int32_t __a, int16_t __b, int16x4_t __c, const int __d)
++vqdmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d)
{
- return __builtin_aarch64_reduc_umin_scal_v4hi_uu (__a);
-+ return __builtin_aarch64_sqdmlal_lanehi (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmlsl_lanev4hi (__a, __b, __c, __d);
}
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vminv_u32 (uint32x2_t __a)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlalh_laneq_s16 (int32_t __a, int16_t __b, int16x8_t __c, const int __d)
++vqdmlsl_laneq_s16 (int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d)
{
- return __builtin_aarch64_reduc_umin_scal_v2si_uu (__a);
-+ return __builtin_aarch64_sqdmlal_laneqhi (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmlsl_laneqv4hi (__a, __b, __c, __d);
}
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vminvq_f32 (float32x4_t __a)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlals_s32 (int64_t __a, int32_t __b, int32_t __c)
++vqdmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
{
- return __builtin_aarch64_reduc_smin_nan_scal_v4sf (__a);
-+ return __builtin_aarch64_sqdmlalsi (__a, __b, __c);
++ return __builtin_aarch64_sqdmlsl_nv4hi (__a, __b, __c);
}
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vminvq_f64 (float64x2_t __a)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlals_lane_s32 (int64_t __a, int32_t __b, int32x2_t __c, const int __d)
++vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
{
- return __builtin_aarch64_reduc_smin_nan_scal_v2df (__a);
-+ return __builtin_aarch64_sqdmlal_lanesi (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmlslv2si (__a, __b, __c);
}
-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
-vminvq_s8 (int8x16_t __a)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlals_laneq_s32 (int64_t __a, int32_t __b, int32x4_t __c, const int __d)
++vqdmlsl_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c)
{
- return __builtin_aarch64_reduc_smin_scal_v16qi (__a);
-+ return __builtin_aarch64_sqdmlal_laneqsi (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmlsl2v4si (__a, __b, __c);
}
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vminvq_s16 (int16x8_t __a)
-+/* vqdmlsl */
-+
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
++vqdmlsl_high_lane_s32 (int64x2_t __a, int32x4_t __b, int32x2_t __c,
++ int const __d)
{
- return __builtin_aarch64_reduc_smin_scal_v8hi (__a);
-+ return __builtin_aarch64_sqdmlslv4hi (__a, __b, __c);
++ return __builtin_aarch64_sqdmlsl2_lanev4si (__a, __b, __c, __d);
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vminvq_s32 (int32x4_t __a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c)
++vqdmlsl_high_laneq_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c,
++ int const __d)
{
- return __builtin_aarch64_reduc_smin_scal_v4si (__a);
-+ return __builtin_aarch64_sqdmlsl2v8hi (__a, __b, __c);
++ return __builtin_aarch64_sqdmlsl2_laneqv4si (__a, __b, __c, __d);
}
-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
-vminvq_u8 (uint8x16_t __a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_high_lane_s16 (int32x4_t __a, int16x8_t __b, int16x4_t __c,
-+ int const __d)
++vqdmlsl_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c)
{
- return __builtin_aarch64_reduc_umin_scal_v16qi_uu (__a);
-+ return __builtin_aarch64_sqdmlsl2_lanev8hi (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmlsl2_nv4si (__a, __b, __c);
}
-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-vminvq_u16 (uint16x8_t __a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_high_laneq_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c,
-+ int const __d)
++vqdmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d)
{
- return __builtin_aarch64_reduc_umin_scal_v8hi_uu (__a);
-+ return __builtin_aarch64_sqdmlsl2_laneqv8hi (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmlsl_lanev2si (__a, __b, __c, __d);
}
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vminvq_u32 (uint32x4_t __a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c)
++vqdmlsl_laneq_s32 (int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d)
{
- return __builtin_aarch64_reduc_umin_scal_v4si_uu (__a);
-+ return __builtin_aarch64_sqdmlsl2_nv8hi (__a, __b, __c);
++ return __builtin_aarch64_sqdmlsl_laneqv2si (__a, __b, __c, __d);
}
-/* vminnmv */
-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vminnmv_f32 (float32x2_t __a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d)
++vqdmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
{
- return __builtin_aarch64_reduc_smin_scal_v2sf (__a);
-+ return __builtin_aarch64_sqdmlsl_lanev4hi (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmlsl_nv2si (__a, __b, __c);
}
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vminnmvq_f32 (float32x4_t __a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_laneq_s16 (int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d)
++vqdmlslh_s16 (int32_t __a, int16_t __b, int16_t __c)
{
- return __builtin_aarch64_reduc_smin_scal_v4sf (__a);
-+ return __builtin_aarch64_sqdmlsl_laneqv4hi (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmlslhi (__a, __b, __c);
}
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vminnmvq_f64 (float64x2_t __a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
++vqdmlslh_lane_s16 (int32_t __a, int16_t __b, int16x4_t __c, const int __d)
{
- return __builtin_aarch64_reduc_smin_scal_v2df (__a);
-+ return __builtin_aarch64_sqdmlsl_nv4hi (__a, __b, __c);
++ return __builtin_aarch64_sqdmlsl_lanehi (__a, __b, __c, __d);
}
-/* vmla */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmla_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
++vqdmlslh_laneq_s16 (int32_t __a, int16_t __b, int16x8_t __c, const int __d)
{
- return a + b * c;
-+ return __builtin_aarch64_sqdmlslv2si (__a, __b, __c);
++ return __builtin_aarch64_sqdmlsl_laneqhi (__a, __b, __c, __d);
}
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vmla_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c)
++vqdmlsls_s32 (int64_t __a, int32_t __b, int32_t __c)
{
- return __a + __b * __c;
-+ return __builtin_aarch64_sqdmlsl2v4si (__a, __b, __c);
++ return __builtin_aarch64_sqdmlslsi (__a, __b, __c);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmlaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_high_lane_s32 (int64x2_t __a, int32x4_t __b, int32x2_t __c,
-+ int const __d)
++vqdmlsls_lane_s32 (int64_t __a, int32_t __b, int32x2_t __c, const int __d)
{
- return a + b * c;
-+ return __builtin_aarch64_sqdmlsl2_lanev4si (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmlsl_lanesi (__a, __b, __c, __d);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmlaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_high_laneq_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c,
-+ int const __d)
++vqdmlsls_laneq_s32 (int64_t __a, int32_t __b, int32x4_t __c, const int __d)
{
- return a + b * c;
-+ return __builtin_aarch64_sqdmlsl2_laneqv4si (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmlsl_laneqsi (__a, __b, __c, __d);
}
-/* vmla_lane */
--
++/* vqdmulh */
+
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmla_lane_f32 (float32x2_t __a, float32x2_t __b,
- float32x2_t __c, const int __lane)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c)
++vqdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
{
- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmlsl2_nv4si (__a, __b, __c);
++ return __builtin_aarch64_sqdmulh_lanev4hi (__a, __b, __c);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmla_lane_s16 (int16x4_t __a, int16x4_t __b,
- int16x4_t __c, const int __lane)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d)
++vqdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
{
- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmlsl_lanev2si (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmulh_lanev2si (__a, __b, __c);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmla_lane_s32 (int32x2_t __a, int32x2_t __b,
- int32x2_t __c, const int __lane)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_laneq_s32 (int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d)
++vqdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
{
- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmlsl_laneqv2si (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmulh_lanev8hi (__a, __b, __c);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmla_lane_u16 (uint16x4_t __a, uint16x4_t __b,
- uint16x4_t __c, const int __lane)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
++vqdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
{
- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmlsl_nv2si (__a, __b, __c);
++ return __builtin_aarch64_sqdmulh_lanev4si (__a, __b, __c);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmla_lane_u32 (uint32x2_t __a, uint32x2_t __b,
- uint32x2_t __c, const int __lane)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlslh_s16 (int32_t __a, int16_t __b, int16_t __c)
++vqdmulhh_s16 (int16_t __a, int16_t __b)
{
- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmlslhi (__a, __b, __c);
++ return (int16_t) __builtin_aarch64_sqdmulhhi (__a, __b);
}
-/* vmla_laneq */
--
++__extension__ extern __inline int16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vqdmulhh_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
++{
++ return __builtin_aarch64_sqdmulh_lanehi (__a, __b, __c);
++}
+
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmla_laneq_f32 (float32x2_t __a, float32x2_t __b,
- float32x4_t __c, const int __lane)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlslh_lane_s16 (int32_t __a, int16_t __b, int16x4_t __c, const int __d)
++vqdmulhh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c)
{
- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmlsl_lanehi (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmulh_laneqhi (__a, __b, __c);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
@@ -35504,79 +37374,86 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- int16x8_t __c, const int __lane)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlslh_laneq_s16 (int32_t __a, int16_t __b, int16x8_t __c, const int __d)
++vqdmulhs_s32 (int32_t __a, int32_t __b)
{
- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmlsl_laneqhi (__a, __b, __c, __d);
++ return (int32_t) __builtin_aarch64_sqdmulhsi (__a, __b);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmla_laneq_s32 (int32x2_t __a, int32x2_t __b,
- int32x4_t __c, const int __lane)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsls_s32 (int64_t __a, int32_t __b, int32_t __c)
++vqdmulhs_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
{
- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmlslsi (__a, __b, __c);
++ return __builtin_aarch64_sqdmulh_lanesi (__a, __b, __c);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmla_laneq_u16 (uint16x4_t __a, uint16x4_t __b,
- uint16x8_t __c, const int __lane)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsls_lane_s32 (int64_t __a, int32_t __b, int32x2_t __c, const int __d)
++vqdmulhs_laneq_s32 (int32_t __a, int32x4_t __b, const int __c)
{
- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmlsl_lanesi (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmulh_laneqsi (__a, __b, __c);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmla_laneq_u32 (uint32x2_t __a, uint32x2_t __b,
- uint32x4_t __c, const int __lane)
-+__extension__ extern __inline int64_t
++/* vqdmull */
++
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsls_laneq_s32 (int64_t __a, int32_t __b, int32x4_t __c, const int __d)
++vqdmull_s16 (int16x4_t __a, int16x4_t __b)
{
- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmlsl_laneqsi (__a, __b, __c, __d);
++ return __builtin_aarch64_sqdmullv4hi (__a, __b);
}
-/* vmlaq_lane */
-+/* vqdmulh */
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vqdmull_high_s16 (int16x8_t __a, int16x8_t __b)
++{
++ return __builtin_aarch64_sqdmull2v8hi (__a, __b);
++}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b,
- float32x2_t __c, const int __lane)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
++vqdmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, int const __c)
{
- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmulh_lanev4hi (__a, __b, __c);
++ return __builtin_aarch64_sqdmull2_lanev8hi (__a, __b, __c);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmlaq_lane_s16 (int16x8_t __a, int16x8_t __b,
- int16x4_t __c, const int __lane)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
++vqdmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, int const __c)
{
- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmulh_lanev2si (__a, __b, __c);
++ return __builtin_aarch64_sqdmull2_laneqv8hi (__a, __b, __c);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmlaq_lane_s32 (int32x4_t __a, int32x4_t __b,
- int32x2_t __c, const int __lane)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
++vqdmull_high_n_s16 (int16x8_t __a, int16_t __b)
{
- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmulh_lanev8hi (__a, __b, __c);
++ return __builtin_aarch64_sqdmull2_nv8hi (__a, __b);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
@@ -35584,121 +37461,126 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- uint16x4_t __c, const int __lane)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
++vqdmull_lane_s16 (int16x4_t __a, int16x4_t __b, int const __c)
{
- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmulh_lanev4si (__a, __b, __c);
++ return __builtin_aarch64_sqdmull_lanev4hi (__a, __b, __c);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmlaq_lane_u32 (uint32x4_t __a, uint32x4_t __b,
- uint32x2_t __c, const int __lane)
-+__extension__ extern __inline int16_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulhh_s16 (int16_t __a, int16_t __b)
++vqdmull_laneq_s16 (int16x4_t __a, int16x8_t __b, int const __c)
{
- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (int16_t) __builtin_aarch64_sqdmulhhi (__a, __b);
++ return __builtin_aarch64_sqdmull_laneqv4hi (__a, __b, __c);
}
- /* vmlaq_laneq */
--
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vqdmull_n_s16 (int16x4_t __a, int16_t __b)
++{
++ return __builtin_aarch64_sqdmull_nv4hi (__a, __b);
++}
+
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmlaq_laneq_f32 (float32x4_t __a, float32x4_t __b,
- float32x4_t __c, const int __lane)
-+__extension__ extern __inline int16_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulhh_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
++vqdmull_s32 (int32x2_t __a, int32x2_t __b)
{
- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmulh_lanehi (__a, __b, __c);
++ return __builtin_aarch64_sqdmullv2si (__a, __b);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmlaq_laneq_s16 (int16x8_t __a, int16x8_t __b,
- int16x8_t __c, const int __lane)
-+__extension__ extern __inline int16_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulhh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c)
++vqdmull_high_s32 (int32x4_t __a, int32x4_t __b)
{
- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmulh_laneqhi (__a, __b, __c);
++ return __builtin_aarch64_sqdmull2v4si (__a, __b);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmlaq_laneq_s32 (int32x4_t __a, int32x4_t __b,
- int32x4_t __c, const int __lane)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulhs_s32 (int32_t __a, int32_t __b)
++vqdmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, int const __c)
{
- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (int32_t) __builtin_aarch64_sqdmulhsi (__a, __b);
++ return __builtin_aarch64_sqdmull2_lanev4si (__a, __b, __c);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmlaq_laneq_u16 (uint16x8_t __a, uint16x8_t __b,
- uint16x8_t __c, const int __lane)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulhs_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
++vqdmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, int const __c)
{
- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmulh_lanesi (__a, __b, __c);
++ return __builtin_aarch64_sqdmull2_laneqv4si (__a, __b, __c);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmlaq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
- uint32x4_t __c, const int __lane)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulhs_laneq_s32 (int32_t __a, int32x4_t __b, const int __c)
++vqdmull_high_n_s32 (int32x4_t __a, int32_t __b)
{
- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmulh_laneqsi (__a, __b, __c);
++ return __builtin_aarch64_sqdmull2_nv4si (__a, __b);
}
-/* vmls */
-+/* vqdmull */
-
+-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmls_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_s16 (int16x4_t __a, int16x4_t __b)
++vqdmull_lane_s32 (int32x2_t __a, int32x2_t __b, int const __c)
{
- return a - b * c;
-+ return __builtin_aarch64_sqdmullv4hi (__a, __b);
++ return __builtin_aarch64_sqdmull_lanev2si (__a, __b, __c);
}
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vmls_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_high_s16 (int16x8_t __a, int16x8_t __b)
++vqdmull_laneq_s32 (int32x2_t __a, int32x4_t __b, int const __c)
{
- return __a - __b * __c;
-+ return __builtin_aarch64_sqdmull2v8hi (__a, __b);
++ return __builtin_aarch64_sqdmull_laneqv2si (__a, __b, __c);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmlsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, int const __c)
++vqdmull_n_s32 (int32x2_t __a, int32_t __b)
{
- return a - b * c;
-+ return __builtin_aarch64_sqdmull2_lanev8hi (__a, __b,__c);
++ return __builtin_aarch64_sqdmull_nv2si (__a, __b);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmlsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, int const __c)
++vqdmullh_s16 (int16_t __a, int16_t __b)
{
- return a - b * c;
-+ return __builtin_aarch64_sqdmull2_laneqv8hi (__a, __b,__c);
++ return (int32_t) __builtin_aarch64_sqdmullhi (__a, __b);
}
-/* vmls_lane */
@@ -35706,113 +37588,114 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmls_lane_f32 (float32x2_t __a, float32x2_t __b,
- float32x2_t __c, const int __lane)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_high_n_s16 (int16x8_t __a, int16_t __b)
++vqdmullh_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
{
- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull2_nv8hi (__a, __b);
++ return __builtin_aarch64_sqdmull_lanehi (__a, __b, __c);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmls_lane_s16 (int16x4_t __a, int16x4_t __b,
- int16x4_t __c, const int __lane)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_lane_s16 (int16x4_t __a, int16x4_t __b, int const __c)
++vqdmullh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c)
{
- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull_lanev4hi (__a, __b, __c);
++ return __builtin_aarch64_sqdmull_laneqhi (__a, __b, __c);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmls_lane_s32 (int32x2_t __a, int32x2_t __b,
- int32x2_t __c, const int __lane)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_laneq_s16 (int16x4_t __a, int16x8_t __b, int const __c)
++vqdmulls_s32 (int32_t __a, int32_t __b)
{
- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull_laneqv4hi (__a, __b, __c);
++ return __builtin_aarch64_sqdmullsi (__a, __b);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmls_lane_u16 (uint16x4_t __a, uint16x4_t __b,
- uint16x4_t __c, const int __lane)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_n_s16 (int16x4_t __a, int16_t __b)
++vqdmulls_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
{
- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull_nv4hi (__a, __b);
++ return __builtin_aarch64_sqdmull_lanesi (__a, __b, __c);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmls_lane_u32 (uint32x2_t __a, uint32x2_t __b,
- uint32x2_t __c, const int __lane)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_s32 (int32x2_t __a, int32x2_t __b)
++vqdmulls_laneq_s32 (int32_t __a, int32x4_t __b, const int __c)
{
- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmullv2si (__a, __b);
++ return __builtin_aarch64_sqdmull_laneqsi (__a, __b, __c);
}
-/* vmls_laneq */
--
++/* vqmovn */
+
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmls_laneq_f32 (float32x2_t __a, float32x2_t __b,
- float32x4_t __c, const int __lane)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_high_s32 (int32x4_t __a, int32x4_t __b)
++vqmovn_s16 (int16x8_t __a)
{
- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull2v4si (__a, __b);
++ return (int8x8_t) __builtin_aarch64_sqmovnv8hi (__a);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmls_laneq_s16 (int16x4_t __a, int16x4_t __b,
- int16x8_t __c, const int __lane)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, int const __c)
++vqmovn_s32 (int32x4_t __a)
{
- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull2_lanev4si (__a, __b, __c);
++ return (int16x4_t) __builtin_aarch64_sqmovnv4si (__a);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmls_laneq_s32 (int32x2_t __a, int32x2_t __b,
- int32x4_t __c, const int __lane)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, int const __c)
++vqmovn_s64 (int64x2_t __a)
{
- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull2_laneqv4si (__a, __b, __c);
++ return (int32x2_t) __builtin_aarch64_sqmovnv2di (__a);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmls_laneq_u16 (uint16x4_t __a, uint16x4_t __b,
- uint16x8_t __c, const int __lane)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_high_n_s32 (int32x4_t __a, int32_t __b)
++vqmovn_u16 (uint16x8_t __a)
{
- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull2_nv4si (__a, __b);
++ return (uint8x8_t) __builtin_aarch64_uqmovnv8hi ((int16x8_t) __a);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmls_laneq_u32 (uint32x2_t __a, uint32x2_t __b,
- uint32x4_t __c, const int __lane)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_lane_s32 (int32x2_t __a, int32x2_t __b, int const __c)
++vqmovn_u32 (uint32x4_t __a)
{
- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull_lanev2si (__a, __b, __c);
++ return (uint16x4_t) __builtin_aarch64_uqmovnv4si ((int32x4_t) __a);
}
-/* vmlsq_lane */
@@ -35820,34 +37703,34 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b,
- float32x2_t __c, const int __lane)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_laneq_s32 (int32x2_t __a, int32x4_t __b, int const __c)
++vqmovn_u64 (uint64x2_t __a)
{
- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull_laneqv2si (__a, __b, __c);
++ return (uint32x2_t) __builtin_aarch64_uqmovnv2di ((int64x2_t) __a);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmlsq_lane_s16 (int16x8_t __a, int16x8_t __b,
- int16x4_t __c, const int __lane)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_n_s32 (int32x2_t __a, int32_t __b)
++vqmovnh_s16 (int16_t __a)
{
- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull_nv2si (__a, __b);
++ return (int8_t) __builtin_aarch64_sqmovnhi (__a);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmlsq_lane_s32 (int32x4_t __a, int32x4_t __b,
- int32x2_t __c, const int __lane)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmullh_s16 (int16_t __a, int16_t __b)
++vqmovns_s32 (int32_t __a)
{
- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (int32_t) __builtin_aarch64_sqdmullhi (__a, __b);
++ return (int16_t) __builtin_aarch64_sqmovnsi (__a);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
@@ -35855,21 +37738,21 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- uint16x4_t __c, const int __lane)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmullh_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
++vqmovnd_s64 (int64_t __a)
{
- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull_lanehi (__a, __b, __c);
++ return (int32_t) __builtin_aarch64_sqmovndi (__a);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmlsq_lane_u32 (uint32x4_t __a, uint32x4_t __b,
- uint32x2_t __c, const int __lane)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmullh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c)
++vqmovnh_u16 (uint16_t __a)
{
- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull_laneqhi (__a, __b, __c);
++ return (uint8_t) __builtin_aarch64_uqmovnhi (__a);
}
- /* vmlsq_laneq */
@@ -35877,1795 +37760,1666 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmlsq_laneq_f32 (float32x4_t __a, float32x4_t __b,
- float32x4_t __c, const int __lane)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulls_s32 (int32_t __a, int32_t __b)
++vqmovns_u32 (uint32_t __a)
{
- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmullsi (__a, __b);
++ return (uint16_t) __builtin_aarch64_uqmovnsi (__a);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmlsq_laneq_s16 (int16x8_t __a, int16x8_t __b,
- int16x8_t __c, const int __lane)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulls_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
++vqmovnd_u64 (uint64_t __a)
{
- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull_lanesi (__a, __b, __c);
++ return (uint32_t) __builtin_aarch64_uqmovndi (__a);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmlsq_laneq_s32 (int32x4_t __a, int32x4_t __b,
- int32x4_t __c, const int __lane)
-+__extension__ extern __inline int64_t
++/* vqmovun */
++
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulls_laneq_s32 (int32_t __a, int32x4_t __b, const int __c)
++vqmovun_s16 (int16x8_t __a)
{
- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull_laneqsi (__a, __b, __c);
++ return (uint8x8_t) __builtin_aarch64_sqmovunv8hi (__a);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmlsq_laneq_u16 (uint16x8_t __a, uint16x8_t __b,
- uint16x8_t __c, const int __lane)
+
-+/* vqmovn */
-+
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_s16 (int16x8_t __a)
++vqmovun_s32 (int32x4_t __a)
{
- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (int8x8_t) __builtin_aarch64_sqmovnv8hi (__a);
++ return (uint16x4_t) __builtin_aarch64_sqmovunv4si (__a);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmlsq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
- uint32x4_t __c, const int __lane)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_s32 (int32x4_t __a)
++vqmovun_s64 (int64x2_t __a)
{
- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (int16x4_t) __builtin_aarch64_sqmovnv4si (__a);
++ return (uint32x2_t) __builtin_aarch64_sqmovunv2di (__a);
}
-/* vmov_n_ */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmov_n_f32 (float32_t __a)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_s64 (int64x2_t __a)
++vqmovunh_s16 (int16_t __a)
{
- return vdup_n_f32 (__a);
-+ return (int32x2_t) __builtin_aarch64_sqmovnv2di (__a);
++ return (int8_t) __builtin_aarch64_sqmovunhi (__a);
}
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vmov_n_f64 (float64_t __a)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_u16 (uint16x8_t __a)
++vqmovuns_s32 (int32_t __a)
{
- return (float64x1_t) {__a};
-+ return (uint8x8_t) __builtin_aarch64_uqmovnv8hi ((int16x8_t) __a);
++ return (int16_t) __builtin_aarch64_sqmovunsi (__a);
}
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vmov_n_p8 (poly8_t __a)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_u32 (uint32x4_t __a)
++vqmovund_s64 (int64_t __a)
{
- return vdup_n_p8 (__a);
-+ return (uint16x4_t) __builtin_aarch64_uqmovnv4si ((int32x4_t) __a);
++ return (int32_t) __builtin_aarch64_sqmovundi (__a);
}
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vmov_n_p16 (poly16_t __a)
-+__extension__ extern __inline uint32x2_t
++/* vqneg */
++
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_u64 (uint64x2_t __a)
++vqnegq_s64 (int64x2_t __a)
{
- return vdup_n_p16 (__a);
-+ return (uint32x2_t) __builtin_aarch64_uqmovnv2di ((int64x2_t) __a);
++ return (int64x2_t) __builtin_aarch64_sqnegv2di (__a);
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vmov_n_s8 (int8_t __a)
+__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovnh_s16 (int16_t __a)
++vqnegb_s8 (int8_t __a)
{
- return vdup_n_s8 (__a);
-+ return (int8_t) __builtin_aarch64_sqmovnhi (__a);
++ return (int8_t) __builtin_aarch64_sqnegqi (__a);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmov_n_s16 (int16_t __a)
+__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovns_s32 (int32_t __a)
++vqnegh_s16 (int16_t __a)
{
- return vdup_n_s16 (__a);
-+ return (int16_t) __builtin_aarch64_sqmovnsi (__a);
++ return (int16_t) __builtin_aarch64_sqneghi (__a);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmov_n_s32 (int32_t __a)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovnd_s64 (int64_t __a)
++vqnegs_s32 (int32_t __a)
{
- return vdup_n_s32 (__a);
-+ return (int32_t) __builtin_aarch64_sqmovndi (__a);
++ return (int32_t) __builtin_aarch64_sqnegsi (__a);
}
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vmov_n_s64 (int64_t __a)
-+__extension__ extern __inline uint8_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovnh_u16 (uint16_t __a)
++vqnegd_s64 (int64_t __a)
{
- return (int64x1_t) {__a};
-+ return (uint8_t) __builtin_aarch64_uqmovnhi (__a);
++ return __builtin_aarch64_sqnegdi (__a);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vmov_n_u8 (uint8_t __a)
-+__extension__ extern __inline uint16_t
++/* vqrdmulh */
++
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovns_u32 (uint32_t __a)
++vqrdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
{
- return vdup_n_u8 (__a);
-+ return (uint16_t) __builtin_aarch64_uqmovnsi (__a);
++ return __builtin_aarch64_sqrdmulh_lanev4hi (__a, __b, __c);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmov_n_u16 (uint16_t __a)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovnd_u64 (uint64_t __a)
++vqrdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
{
- return vdup_n_u16 (__a);
-+ return (uint32_t) __builtin_aarch64_uqmovndi (__a);
++ return __builtin_aarch64_sqrdmulh_lanev2si (__a, __b, __c);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmov_n_u32 (uint32_t __a)
-+/* vqmovun */
-+
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovun_s16 (int16x8_t __a)
++vqrdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
{
- return vdup_n_u32 (__a);
-+ return (uint8x8_t) __builtin_aarch64_sqmovunv8hi (__a);
++ return __builtin_aarch64_sqrdmulh_lanev8hi (__a, __b, __c);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vmov_n_u64 (uint64_t __a)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovun_s32 (int32x4_t __a)
++vqrdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
{
- return (uint64x1_t) {__a};
-+ return (uint16x4_t) __builtin_aarch64_sqmovunv4si (__a);
++ return __builtin_aarch64_sqrdmulh_lanev4si (__a, __b, __c);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmovq_n_f32 (float32_t __a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovun_s64 (int64x2_t __a)
++vqrdmulhh_s16 (int16_t __a, int16_t __b)
{
- return vdupq_n_f32 (__a);
-+ return (uint32x2_t) __builtin_aarch64_sqmovunv2di (__a);
++ return (int16_t) __builtin_aarch64_sqrdmulhhi (__a, __b);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmovq_n_f64 (float64_t __a)
-+__extension__ extern __inline int8_t
++__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovunh_s16 (int16_t __a)
++vqrdmulhh_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
{
- return vdupq_n_f64 (__a);
-+ return (int8_t) __builtin_aarch64_sqmovunhi (__a);
++ return __builtin_aarch64_sqrdmulh_lanehi (__a, __b, __c);
}
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vmovq_n_p8 (poly8_t __a)
+__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovuns_s32 (int32_t __a)
++vqrdmulhh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c)
{
- return vdupq_n_p8 (__a);
-+ return (int16_t) __builtin_aarch64_sqmovunsi (__a);
++ return __builtin_aarch64_sqrdmulh_laneqhi (__a, __b, __c);
}
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vmovq_n_p16 (poly16_t __a)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovund_s64 (int64_t __a)
++vqrdmulhs_s32 (int32_t __a, int32_t __b)
{
- return vdupq_n_p16 (__a);
-+ return (int32_t) __builtin_aarch64_sqmovundi (__a);
++ return (int32_t) __builtin_aarch64_sqrdmulhsi (__a, __b);
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vmovq_n_s8 (int8_t __a)
-+/* vqneg */
-+
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqnegq_s64 (int64x2_t __a)
++vqrdmulhs_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
{
- return vdupq_n_s8 (__a);
-+ return (int64x2_t) __builtin_aarch64_sqnegv2di (__a);
++ return __builtin_aarch64_sqrdmulh_lanesi (__a, __b, __c);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmovq_n_s16 (int16_t __a)
-+__extension__ extern __inline int8_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqnegb_s8 (int8_t __a)
++vqrdmulhs_laneq_s32 (int32_t __a, int32x4_t __b, const int __c)
{
- return vdupq_n_s16 (__a);
-+ return (int8_t) __builtin_aarch64_sqnegqi (__a);
++ return __builtin_aarch64_sqrdmulh_laneqsi (__a, __b, __c);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmovq_n_s32 (int32_t __a)
-+__extension__ extern __inline int16_t
++/* vqrshl */
++
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqnegh_s16 (int16_t __a)
++vqrshl_s8 (int8x8_t __a, int8x8_t __b)
{
- return vdupq_n_s32 (__a);
-+ return (int16_t) __builtin_aarch64_sqneghi (__a);
++ return __builtin_aarch64_sqrshlv8qi (__a, __b);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmovq_n_s64 (int64_t __a)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqnegs_s32 (int32_t __a)
++vqrshl_s16 (int16x4_t __a, int16x4_t __b)
{
- return vdupq_n_s64 (__a);
-+ return (int32_t) __builtin_aarch64_sqnegsi (__a);
++ return __builtin_aarch64_sqrshlv4hi (__a, __b);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vmovq_n_u8 (uint8_t __a)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqnegd_s64 (int64_t __a)
++vqrshl_s32 (int32x2_t __a, int32x2_t __b)
{
- return vdupq_n_u8 (__a);
-+ return __builtin_aarch64_sqnegdi (__a);
++ return __builtin_aarch64_sqrshlv2si (__a, __b);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmovq_n_u16 (uint16_t __a)
-+/* vqrdmulh */
-+
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
++vqrshl_s64 (int64x1_t __a, int64x1_t __b)
{
- return vdupq_n_u16 (__a);
-+ return __builtin_aarch64_sqrdmulh_lanev4hi (__a, __b, __c);
++ return (int64x1_t) {__builtin_aarch64_sqrshldi (__a[0], __b[0])};
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmovq_n_u32 (uint32_t __a)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
++vqrshl_u8 (uint8x8_t __a, int8x8_t __b)
{
- return vdupq_n_u32 (__a);
-+ return __builtin_aarch64_sqrdmulh_lanev2si (__a, __b, __c);
++ return __builtin_aarch64_uqrshlv8qi_uus ( __a, __b);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmovq_n_u64 (uint64_t __a)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
++vqrshl_u16 (uint16x4_t __a, int16x4_t __b)
{
- return vdupq_n_u64 (__a);
-+ return __builtin_aarch64_sqrdmulh_lanev8hi (__a, __b, __c);
++ return __builtin_aarch64_uqrshlv4hi_uus ( __a, __b);
}
-/* vmul_lane */
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
++vqrshl_u32 (uint32x2_t __a, int32x2_t __b)
+{
-+ return __builtin_aarch64_sqrdmulh_lanev4si (__a, __b, __c);
++ return __builtin_aarch64_uqrshlv2si_uus ( __a, __b);
+}
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmul_lane_f32 (float32x2_t __a, float32x2_t __b, const int __lane)
-+__extension__ extern __inline int16_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhh_s16 (int16_t __a, int16_t __b)
++vqrshl_u64 (uint64x1_t __a, int64x1_t __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return (int16_t) __builtin_aarch64_sqrdmulhhi (__a, __b);
++ return (uint64x1_t) {__builtin_aarch64_uqrshldi_uus (__a[0], __b[0])};
}
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vmul_lane_f64 (float64x1_t __a, float64x1_t __b, const int __lane)
-+__extension__ extern __inline int16_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhh_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
++vqrshlq_s8 (int8x16_t __a, int8x16_t __b)
{
- return __a * __b;
-+ return __builtin_aarch64_sqrdmulh_lanehi (__a, __b, __c);
++ return __builtin_aarch64_sqrshlv16qi (__a, __b);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmul_lane_s16 (int16x4_t __a, int16x4_t __b, const int __lane)
-+__extension__ extern __inline int16_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c)
++vqrshlq_s16 (int16x8_t __a, int16x8_t __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_sqrdmulh_laneqhi (__a, __b, __c);
++ return __builtin_aarch64_sqrshlv8hi (__a, __b);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmul_lane_s32 (int32x2_t __a, int32x2_t __b, const int __lane)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhs_s32 (int32_t __a, int32_t __b)
++vqrshlq_s32 (int32x4_t __a, int32x4_t __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return (int32_t) __builtin_aarch64_sqrdmulhsi (__a, __b);
++ return __builtin_aarch64_sqrshlv4si (__a, __b);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmul_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __lane)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhs_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
++vqrshlq_s64 (int64x2_t __a, int64x2_t __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_sqrdmulh_lanesi (__a, __b, __c);
++ return __builtin_aarch64_sqrshlv2di (__a, __b);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmul_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __lane)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhs_laneq_s32 (int32_t __a, int32x4_t __b, const int __c)
++vqrshlq_u8 (uint8x16_t __a, int8x16_t __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_sqrdmulh_laneqsi (__a, __b, __c);
++ return __builtin_aarch64_uqrshlv16qi_uus ( __a, __b);
}
-/* vmuld_lane */
-+/* vqrshl */
-
+-
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vmuld_lane_f64 (float64_t __a, float64x1_t __b, const int __lane)
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshl_s8 (int8x8_t __a, int8x8_t __b)
++vqrshlq_u16 (uint16x8_t __a, int16x8_t __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_sqrshlv8qi (__a, __b);
++ return __builtin_aarch64_uqrshlv8hi_uus ( __a, __b);
}
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vmuld_laneq_f64 (float64_t __a, float64x2_t __b, const int __lane)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshl_s16 (int16x4_t __a, int16x4_t __b)
++vqrshlq_u32 (uint32x4_t __a, int32x4_t __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_sqrshlv4hi (__a, __b);
++ return __builtin_aarch64_uqrshlv4si_uus ( __a, __b);
}
-/* vmuls_lane */
-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmuls_lane_f32 (float32_t __a, float32x2_t __b, const int __lane)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshl_s32 (int32x2_t __a, int32x2_t __b)
++vqrshlq_u64 (uint64x2_t __a, int64x2_t __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_sqrshlv2si (__a, __b);
++ return __builtin_aarch64_uqrshlv2di_uus ( __a, __b);
}
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmuls_laneq_f32 (float32_t __a, float32x4_t __b, const int __lane)
-+__extension__ extern __inline int64x1_t
++__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshl_s64 (int64x1_t __a, int64x1_t __b)
++vqrshlb_s8 (int8_t __a, int8_t __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return (int64x1_t) {__builtin_aarch64_sqrshldi (__a[0], __b[0])};
++ return __builtin_aarch64_sqrshlqi (__a, __b);
}
-/* vmul_laneq */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmul_laneq_f32 (float32x2_t __a, float32x4_t __b, const int __lane)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshl_u8 (uint8x8_t __a, int8x8_t __b)
++vqrshlh_s16 (int16_t __a, int16_t __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshlv8qi_uus ( __a, __b);
++ return __builtin_aarch64_sqrshlhi (__a, __b);
}
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vmul_laneq_f64 (float64x1_t __a, float64x2_t __b, const int __lane)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshl_u16 (uint16x4_t __a, int16x4_t __b)
++vqrshls_s32 (int32_t __a, int32_t __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshlv4hi_uus ( __a, __b);
++ return __builtin_aarch64_sqrshlsi (__a, __b);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmul_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __lane)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshl_u32 (uint32x2_t __a, int32x2_t __b)
++vqrshld_s64 (int64_t __a, int64_t __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshlv2si_uus ( __a, __b);
++ return __builtin_aarch64_sqrshldi (__a, __b);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmul_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __lane)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshl_u64 (uint64x1_t __a, int64x1_t __b)
++vqrshlb_u8 (uint8_t __a, uint8_t __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return (uint64x1_t) {__builtin_aarch64_uqrshldi_uus (__a[0], __b[0])};
++ return __builtin_aarch64_uqrshlqi_uus (__a, __b);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmul_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const int __lane)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlq_s8 (int8x16_t __a, int8x16_t __b)
++vqrshlh_u16 (uint16_t __a, uint16_t __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_sqrshlv16qi (__a, __b);
++ return __builtin_aarch64_uqrshlhi_uus (__a, __b);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmul_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const int __lane)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlq_s16 (int16x8_t __a, int16x8_t __b)
++vqrshls_u32 (uint32_t __a, uint32_t __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_sqrshlv8hi (__a, __b);
++ return __builtin_aarch64_uqrshlsi_uus (__a, __b);
}
-/* vmul_n */
-
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vmul_n_f64 (float64x1_t __a, float64_t __b)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlq_s32 (int32x4_t __a, int32x4_t __b)
++vqrshld_u64 (uint64_t __a, uint64_t __b)
{
- return (float64x1_t) { vget_lane_f64 (__a, 0) * __b };
-+ return __builtin_aarch64_sqrshlv4si (__a, __b);
++ return __builtin_aarch64_uqrshldi_uus (__a, __b);
}
-/* vmulq_lane */
--
++/* vqrshrn */
+
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __lane)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlq_s64 (int64x2_t __a, int64x2_t __b)
++vqrshrn_n_s16 (int16x8_t __a, const int __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_sqrshlv2di (__a, __b);
++ return (int8x8_t) __builtin_aarch64_sqrshrn_nv8hi (__a, __b);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulq_lane_f64 (float64x2_t __a, float64x1_t __b, const int __lane)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlq_u8 (uint8x16_t __a, int8x16_t __b)
++vqrshrn_n_s32 (int32x4_t __a, const int __b)
{
- __AARCH64_LANE_CHECK (__a, __lane);
- return __a * __b[0];
-+ return __builtin_aarch64_uqrshlv16qi_uus ( __a, __b);
++ return (int16x4_t) __builtin_aarch64_sqrshrn_nv4si (__a, __b);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmulq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __lane)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlq_u16 (uint16x8_t __a, int16x8_t __b)
++vqrshrn_n_s64 (int64x2_t __a, const int __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshlv8hi_uus ( __a, __b);
++ return (int32x2_t) __builtin_aarch64_sqrshrn_nv2di (__a, __b);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __lane)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlq_u32 (uint32x4_t __a, int32x4_t __b)
++vqrshrn_n_u16 (uint16x8_t __a, const int __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshlv4si_uus ( __a, __b);
++ return __builtin_aarch64_uqrshrn_nv8hi_uus ( __a, __b);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmulq_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __lane)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlq_u64 (uint64x2_t __a, int64x2_t __b)
++vqrshrn_n_u32 (uint32x4_t __a, const int __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshlv2di_uus ( __a, __b);
++ return __builtin_aarch64_uqrshrn_nv4si_uus ( __a, __b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmulq_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __lane)
-+__extension__ extern __inline int8_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlb_s8 (int8_t __a, int8_t __b)
++vqrshrn_n_u64 (uint64x2_t __a, const int __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_sqrshlqi (__a, __b);
++ return __builtin_aarch64_uqrshrn_nv2di_uus ( __a, __b);
}
-/* vmulq_laneq */
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlh_s16 (int16_t __a, int16_t __b)
-+{
-+ return __builtin_aarch64_sqrshlhi (__a, __b);
-+}
-
+-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulq_laneq_f32 (float32x4_t __a, float32x4_t __b, const int __lane)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshls_s32 (int32_t __a, int32_t __b)
++vqrshrnh_n_s16 (int16_t __a, const int __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_sqrshlsi (__a, __b);
++ return (int8_t) __builtin_aarch64_sqrshrn_nhi (__a, __b);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulq_laneq_f64 (float64x2_t __a, float64x2_t __b, const int __lane)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshld_s64 (int64_t __a, int64_t __b)
++vqrshrns_n_s32 (int32_t __a, const int __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_sqrshldi (__a, __b);
++ return (int16_t) __builtin_aarch64_sqrshrn_nsi (__a, __b);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmulq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __lane)
-+__extension__ extern __inline uint8_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlb_u8 (uint8_t __a, uint8_t __b)
++vqrshrnd_n_s64 (int64_t __a, const int __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshlqi_uus (__a, __b);
++ return (int32_t) __builtin_aarch64_sqrshrn_ndi (__a, __b);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmulq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __lane)
-+__extension__ extern __inline uint16_t
++__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlh_u16 (uint16_t __a, uint16_t __b)
++vqrshrnh_n_u16 (uint16_t __a, const int __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshlhi_uus (__a, __b);
++ return __builtin_aarch64_uqrshrn_nhi_uus (__a, __b);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmulq_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __lane)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshls_u32 (uint32_t __a, uint32_t __b)
++vqrshrns_n_u32 (uint32_t __a, const int __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshlsi_uus (__a, __b);
++ return __builtin_aarch64_uqrshrn_nsi_uus (__a, __b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __lane)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshld_u64 (uint64_t __a, uint64_t __b)
++vqrshrnd_n_u64 (uint64_t __a, const int __b)
{
- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshldi_uus (__a, __b);
++ return __builtin_aarch64_uqrshrn_ndi_uus (__a, __b);
}
-/* vneg */
-+/* vqrshrn */
++/* vqrshrun */
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vneg_f32 (float32x2_t __a)
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrn_n_s16 (int16x8_t __a, const int __b)
++vqrshrun_n_s16 (int16x8_t __a, const int __b)
{
- return -__a;
-+ return (int8x8_t) __builtin_aarch64_sqrshrn_nv8hi (__a, __b);
++ return (uint8x8_t) __builtin_aarch64_sqrshrun_nv8hi (__a, __b);
}
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vneg_f64 (float64x1_t __a)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrn_n_s32 (int32x4_t __a, const int __b)
++vqrshrun_n_s32 (int32x4_t __a, const int __b)
{
- return -__a;
-+ return (int16x4_t) __builtin_aarch64_sqrshrn_nv4si (__a, __b);
++ return (uint16x4_t) __builtin_aarch64_sqrshrun_nv4si (__a, __b);
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vneg_s8 (int8x8_t __a)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrn_n_s64 (int64x2_t __a, const int __b)
++vqrshrun_n_s64 (int64x2_t __a, const int __b)
{
- return -__a;
-+ return (int32x2_t) __builtin_aarch64_sqrshrn_nv2di (__a, __b);
++ return (uint32x2_t) __builtin_aarch64_sqrshrun_nv2di (__a, __b);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vneg_s16 (int16x4_t __a)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrn_n_u16 (uint16x8_t __a, const int __b)
++vqrshrunh_n_s16 (int16_t __a, const int __b)
{
- return -__a;
-+ return __builtin_aarch64_uqrshrn_nv8hi_uus ( __a, __b);
++ return (int8_t) __builtin_aarch64_sqrshrun_nhi (__a, __b);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vneg_s32 (int32x2_t __a)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrn_n_u32 (uint32x4_t __a, const int __b)
++vqrshruns_n_s32 (int32_t __a, const int __b)
{
- return -__a;
-+ return __builtin_aarch64_uqrshrn_nv4si_uus ( __a, __b);
++ return (int16_t) __builtin_aarch64_sqrshrun_nsi (__a, __b);
}
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vneg_s64 (int64x1_t __a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrn_n_u64 (uint64x2_t __a, const int __b)
++vqrshrund_n_s64 (int64_t __a, const int __b)
{
- return -__a;
-+ return __builtin_aarch64_uqrshrn_nv2di_uus ( __a, __b);
++ return (int32_t) __builtin_aarch64_sqrshrun_ndi (__a, __b);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vnegq_f32 (float32x4_t __a)
-+__extension__ extern __inline int8_t
++/* vqshl */
++
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrnh_n_s16 (int16_t __a, const int __b)
++vqshl_s8 (int8x8_t __a, int8x8_t __b)
{
- return -__a;
-+ return (int8_t) __builtin_aarch64_sqrshrn_nhi (__a, __b);
++ return __builtin_aarch64_sqshlv8qi (__a, __b);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vnegq_f64 (float64x2_t __a)
-+__extension__ extern __inline int16_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrns_n_s32 (int32_t __a, const int __b)
++vqshl_s16 (int16x4_t __a, int16x4_t __b)
{
- return -__a;
-+ return (int16_t) __builtin_aarch64_sqrshrn_nsi (__a, __b);
++ return __builtin_aarch64_sqshlv4hi (__a, __b);
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vnegq_s8 (int8x16_t __a)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrnd_n_s64 (int64_t __a, const int __b)
++vqshl_s32 (int32x2_t __a, int32x2_t __b)
{
- return -__a;
-+ return (int32_t) __builtin_aarch64_sqrshrn_ndi (__a, __b);
++ return __builtin_aarch64_sqshlv2si (__a, __b);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vnegq_s16 (int16x8_t __a)
-+__extension__ extern __inline uint8_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrnh_n_u16 (uint16_t __a, const int __b)
++vqshl_s64 (int64x1_t __a, int64x1_t __b)
{
- return -__a;
-+ return __builtin_aarch64_uqrshrn_nhi_uus (__a, __b);
++ return (int64x1_t) {__builtin_aarch64_sqshldi (__a[0], __b[0])};
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vnegq_s32 (int32x4_t __a)
-+__extension__ extern __inline uint16_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrns_n_u32 (uint32_t __a, const int __b)
++vqshl_u8 (uint8x8_t __a, int8x8_t __b)
{
- return -__a;
-+ return __builtin_aarch64_uqrshrn_nsi_uus (__a, __b);
++ return __builtin_aarch64_uqshlv8qi_uus ( __a, __b);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vnegq_s64 (int64x2_t __a)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrnd_n_u64 (uint64_t __a, const int __b)
++vqshl_u16 (uint16x4_t __a, int16x4_t __b)
{
- return -__a;
-+ return __builtin_aarch64_uqrshrn_ndi_uus (__a, __b);
++ return __builtin_aarch64_uqshlv4hi_uus ( __a, __b);
}
-/* vpadd */
-+/* vqrshrun */
-
+-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vpadd_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrun_n_s16 (int16x8_t __a, const int __b)
++vqshl_u32 (uint32x2_t __a, int32x2_t __b)
{
- return __builtin_aarch64_addpv8qi (__a, __b);
-+ return (uint8x8_t) __builtin_aarch64_sqrshrun_nv8hi (__a, __b);
++ return __builtin_aarch64_uqshlv2si_uus ( __a, __b);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vpadd_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrun_n_s32 (int32x4_t __a, const int __b)
++vqshl_u64 (uint64x1_t __a, int64x1_t __b)
{
- return __builtin_aarch64_addpv4hi (__a, __b);
-+ return (uint16x4_t) __builtin_aarch64_sqrshrun_nv4si (__a, __b);
++ return (uint64x1_t) {__builtin_aarch64_uqshldi_uus (__a[0], __b[0])};
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vpadd_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrun_n_s64 (int64x2_t __a, const int __b)
++vqshlq_s8 (int8x16_t __a, int8x16_t __b)
{
- return __builtin_aarch64_addpv2si (__a, __b);
-+ return (uint32x2_t) __builtin_aarch64_sqrshrun_nv2di (__a, __b);
++ return __builtin_aarch64_sqshlv16qi (__a, __b);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vpadd_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline int8_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrunh_n_s16 (int16_t __a, const int __b)
++vqshlq_s16 (int16x8_t __a, int16x8_t __b)
{
- return (uint8x8_t) __builtin_aarch64_addpv8qi ((int8x8_t) __a,
- (int8x8_t) __b);
-+ return (int8_t) __builtin_aarch64_sqrshrun_nhi (__a, __b);
++ return __builtin_aarch64_sqshlv8hi (__a, __b);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vpadd_u16 (uint16x4_t __a, uint16x4_t __b)
-+__extension__ extern __inline int16_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshruns_n_s32 (int32_t __a, const int __b)
++vqshlq_s32 (int32x4_t __a, int32x4_t __b)
{
- return (uint16x4_t) __builtin_aarch64_addpv4hi ((int16x4_t) __a,
- (int16x4_t) __b);
-+ return (int16_t) __builtin_aarch64_sqrshrun_nsi (__a, __b);
++ return __builtin_aarch64_sqshlv4si (__a, __b);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vpadd_u32 (uint32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrund_n_s64 (int64_t __a, const int __b)
++vqshlq_s64 (int64x2_t __a, int64x2_t __b)
{
- return (uint32x2_t) __builtin_aarch64_addpv2si ((int32x2_t) __a,
- (int32x2_t) __b);
-+ return (int32_t) __builtin_aarch64_sqrshrun_ndi (__a, __b);
++ return __builtin_aarch64_sqshlv2di (__a, __b);
}
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vpaddd_f64 (float64x2_t __a)
--{
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vqshlq_u8 (uint8x16_t __a, int8x16_t __b)
+ {
- return __builtin_aarch64_reduc_plus_scal_v2df (__a);
--}
-+/* vqshl */
++ return __builtin_aarch64_uqshlv16qi_uus ( __a, __b);
+ }
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vpaddd_s64 (int64x2_t __a)
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_s8 (int8x8_t __a, int8x8_t __b)
++vqshlq_u16 (uint16x8_t __a, int16x8_t __b)
{
- return __builtin_aarch64_addpdi (__a);
-+ return __builtin_aarch64_sqshlv8qi (__a, __b);
++ return __builtin_aarch64_uqshlv8hi_uus ( __a, __b);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vpaddd_u64 (uint64x2_t __a)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_s16 (int16x4_t __a, int16x4_t __b)
++vqshlq_u32 (uint32x4_t __a, int32x4_t __b)
{
- return __builtin_aarch64_addpdi ((int64x2_t) __a);
-+ return __builtin_aarch64_sqshlv4hi (__a, __b);
++ return __builtin_aarch64_uqshlv4si_uus ( __a, __b);
}
-/* vqabs */
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqabsq_s64 (int64x2_t __a)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_s32 (int32x2_t __a, int32x2_t __b)
++vqshlq_u64 (uint64x2_t __a, int64x2_t __b)
{
- return (int64x2_t) __builtin_aarch64_sqabsv2di (__a);
-+ return __builtin_aarch64_sqshlv2si (__a, __b);
++ return __builtin_aarch64_uqshlv2di_uus ( __a, __b);
}
-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
-vqabsb_s8 (int8_t __a)
-+__extension__ extern __inline int64x1_t
++__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_s64 (int64x1_t __a, int64x1_t __b)
++vqshlb_s8 (int8_t __a, int8_t __b)
{
- return (int8_t) __builtin_aarch64_sqabsqi (__a);
-+ return (int64x1_t) {__builtin_aarch64_sqshldi (__a[0], __b[0])};
++ return __builtin_aarch64_sqshlqi (__a, __b);
}
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vqabsh_s16 (int16_t __a)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_u8 (uint8x8_t __a, int8x8_t __b)
++vqshlh_s16 (int16_t __a, int16_t __b)
{
- return (int16_t) __builtin_aarch64_sqabshi (__a);
-+ return __builtin_aarch64_uqshlv8qi_uus ( __a, __b);
++ return __builtin_aarch64_sqshlhi (__a, __b);
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vqabss_s32 (int32_t __a)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_u16 (uint16x4_t __a, int16x4_t __b)
++vqshls_s32 (int32_t __a, int32_t __b)
{
- return (int32_t) __builtin_aarch64_sqabssi (__a);
-+ return __builtin_aarch64_uqshlv4hi_uus ( __a, __b);
++ return __builtin_aarch64_sqshlsi (__a, __b);
}
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vqabsd_s64 (int64_t __a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_u32 (uint32x2_t __a, int32x2_t __b)
++vqshld_s64 (int64_t __a, int64_t __b)
{
- return __builtin_aarch64_sqabsdi (__a);
-+ return __builtin_aarch64_uqshlv2si_uus ( __a, __b);
++ return __builtin_aarch64_sqshldi (__a, __b);
}
-/* vqadd */
-
-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
-vqaddb_s8 (int8_t __a, int8_t __b)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_u64 (uint64x1_t __a, int64x1_t __b)
++vqshlb_u8 (uint8_t __a, uint8_t __b)
{
- return (int8_t) __builtin_aarch64_sqaddqi (__a, __b);
-+ return (uint64x1_t) {__builtin_aarch64_uqshldi_uus (__a[0], __b[0])};
++ return __builtin_aarch64_uqshlqi_uus (__a, __b);
}
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vqaddh_s16 (int16_t __a, int16_t __b)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_s8 (int8x16_t __a, int8x16_t __b)
++vqshlh_u16 (uint16_t __a, uint16_t __b)
{
- return (int16_t) __builtin_aarch64_sqaddhi (__a, __b);
-+ return __builtin_aarch64_sqshlv16qi (__a, __b);
++ return __builtin_aarch64_uqshlhi_uus (__a, __b);
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vqadds_s32 (int32_t __a, int32_t __b)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_s16 (int16x8_t __a, int16x8_t __b)
++vqshls_u32 (uint32_t __a, uint32_t __b)
{
- return (int32_t) __builtin_aarch64_sqaddsi (__a, __b);
-+ return __builtin_aarch64_sqshlv8hi (__a, __b);
++ return __builtin_aarch64_uqshlsi_uus (__a, __b);
}
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vqaddd_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_s32 (int32x4_t __a, int32x4_t __b)
++vqshld_u64 (uint64_t __a, uint64_t __b)
{
- return __builtin_aarch64_sqadddi (__a, __b);
-+ return __builtin_aarch64_sqshlv4si (__a, __b);
++ return __builtin_aarch64_uqshldi_uus (__a, __b);
}
-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
-vqaddb_u8 (uint8_t __a, uint8_t __b)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_s64 (int64x2_t __a, int64x2_t __b)
++vqshl_n_s8 (int8x8_t __a, const int __b)
{
- return (uint8_t) __builtin_aarch64_uqaddqi_uuu (__a, __b);
-+ return __builtin_aarch64_sqshlv2di (__a, __b);
++ return (int8x8_t) __builtin_aarch64_sqshl_nv8qi (__a, __b);
}
-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-vqaddh_u16 (uint16_t __a, uint16_t __b)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_u8 (uint8x16_t __a, int8x16_t __b)
++vqshl_n_s16 (int16x4_t __a, const int __b)
{
- return (uint16_t) __builtin_aarch64_uqaddhi_uuu (__a, __b);
-+ return __builtin_aarch64_uqshlv16qi_uus ( __a, __b);
++ return (int16x4_t) __builtin_aarch64_sqshl_nv4hi (__a, __b);
}
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vqadds_u32 (uint32_t __a, uint32_t __b)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_u16 (uint16x8_t __a, int16x8_t __b)
++vqshl_n_s32 (int32x2_t __a, const int __b)
{
- return (uint32_t) __builtin_aarch64_uqaddsi_uuu (__a, __b);
-+ return __builtin_aarch64_uqshlv8hi_uus ( __a, __b);
++ return (int32x2_t) __builtin_aarch64_sqshl_nv2si (__a, __b);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vqaddd_u64 (uint64_t __a, uint64_t __b)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_u32 (uint32x4_t __a, int32x4_t __b)
++vqshl_n_s64 (int64x1_t __a, const int __b)
{
- return __builtin_aarch64_uqadddi_uuu (__a, __b);
-+ return __builtin_aarch64_uqshlv4si_uus ( __a, __b);
++ return (int64x1_t) {__builtin_aarch64_sqshl_ndi (__a[0], __b)};
}
-/* vqdmlal */
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_u64 (uint64x2_t __a, int64x2_t __b)
++vqshl_n_u8 (uint8x8_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlalv4hi (__a, __b, __c);
-+ return __builtin_aarch64_uqshlv2di_uus ( __a, __b);
++ return __builtin_aarch64_uqshl_nv8qi_uus (__a, __b);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlal_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c)
-+__extension__ extern __inline int8_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlb_s8 (int8_t __a, int8_t __b)
++vqshl_n_u16 (uint16x4_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlal2v8hi (__a, __b, __c);
-+ return __builtin_aarch64_sqshlqi (__a, __b);
++ return __builtin_aarch64_uqshl_nv4hi_uus (__a, __b);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlal_high_lane_s16 (int32x4_t __a, int16x8_t __b, int16x4_t __c,
- int const __d)
-+__extension__ extern __inline int16_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlh_s16 (int16_t __a, int16_t __b)
++vqshl_n_u32 (uint32x2_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlal2_lanev8hi (__a, __b, __c, __d);
-+ return __builtin_aarch64_sqshlhi (__a, __b);
++ return __builtin_aarch64_uqshl_nv2si_uus (__a, __b);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlal_high_laneq_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c,
- int const __d)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshls_s32 (int32_t __a, int32_t __b)
++vqshl_n_u64 (uint64x1_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlal2_laneqv8hi (__a, __b, __c, __d);
-+ return __builtin_aarch64_sqshlsi (__a, __b);
++ return (uint64x1_t) {__builtin_aarch64_uqshl_ndi_uus (__a[0], __b)};
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlal_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshld_s64 (int64_t __a, int64_t __b)
++vqshlq_n_s8 (int8x16_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlal2_nv8hi (__a, __b, __c);
-+ return __builtin_aarch64_sqshldi (__a, __b);
++ return (int8x16_t) __builtin_aarch64_sqshl_nv16qi (__a, __b);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d)
-+__extension__ extern __inline uint8_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlb_u8 (uint8_t __a, uint8_t __b)
++vqshlq_n_s16 (int16x8_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlal_lanev4hi (__a, __b, __c, __d);
-+ return __builtin_aarch64_uqshlqi_uus (__a, __b);
++ return (int16x8_t) __builtin_aarch64_sqshl_nv8hi (__a, __b);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlal_laneq_s16 (int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d)
-+__extension__ extern __inline uint16_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlh_u16 (uint16_t __a, uint16_t __b)
++vqshlq_n_s32 (int32x4_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlal_laneqv4hi (__a, __b, __c, __d);
-+ return __builtin_aarch64_uqshlhi_uus (__a, __b);
++ return (int32x4_t) __builtin_aarch64_sqshl_nv4si (__a, __b);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshls_u32 (uint32_t __a, uint32_t __b)
++vqshlq_n_s64 (int64x2_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlal_nv4hi (__a, __b, __c);
-+ return __builtin_aarch64_uqshlsi_uus (__a, __b);
++ return (int64x2_t) __builtin_aarch64_sqshl_nv2di (__a, __b);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshld_u64 (uint64_t __a, uint64_t __b)
++vqshlq_n_u8 (uint8x16_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlalv2si (__a, __b, __c);
-+ return __builtin_aarch64_uqshldi_uus (__a, __b);
++ return __builtin_aarch64_uqshl_nv16qi_uus (__a, __b);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlal_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c)
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_n_s8 (int8x8_t __a, const int __b)
++vqshlq_n_u16 (uint16x8_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlal2v4si (__a, __b, __c);
-+ return (int8x8_t) __builtin_aarch64_sqshl_nv8qi (__a, __b);
++ return __builtin_aarch64_uqshl_nv8hi_uus (__a, __b);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlal_high_lane_s32 (int64x2_t __a, int32x4_t __b, int32x2_t __c,
- int const __d)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_n_s16 (int16x4_t __a, const int __b)
++vqshlq_n_u32 (uint32x4_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlal2_lanev4si (__a, __b, __c, __d);
-+ return (int16x4_t) __builtin_aarch64_sqshl_nv4hi (__a, __b);
++ return __builtin_aarch64_uqshl_nv4si_uus (__a, __b);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlal_high_laneq_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c,
- int const __d)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_n_s32 (int32x2_t __a, const int __b)
++vqshlq_n_u64 (uint64x2_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlal2_laneqv4si (__a, __b, __c, __d);
-+ return (int32x2_t) __builtin_aarch64_sqshl_nv2si (__a, __b);
++ return __builtin_aarch64_uqshl_nv2di_uus (__a, __b);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlal_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c)
-+__extension__ extern __inline int64x1_t
++__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_n_s64 (int64x1_t __a, const int __b)
++vqshlb_n_s8 (int8_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlal2_nv4si (__a, __b, __c);
-+ return (int64x1_t) {__builtin_aarch64_sqshl_ndi (__a[0], __b)};
++ return (int8_t) __builtin_aarch64_sqshl_nqi (__a, __b);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_n_u8 (uint8x8_t __a, const int __b)
++vqshlh_n_s16 (int16_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlal_lanev2si (__a, __b, __c, __d);
-+ return __builtin_aarch64_uqshl_nv8qi_uus (__a, __b);
++ return (int16_t) __builtin_aarch64_sqshl_nhi (__a, __b);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlal_laneq_s32 (int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_n_u16 (uint16x4_t __a, const int __b)
++vqshls_n_s32 (int32_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlal_laneqv2si (__a, __b, __c, __d);
-+ return __builtin_aarch64_uqshl_nv4hi_uus (__a, __b);
++ return (int32_t) __builtin_aarch64_sqshl_nsi (__a, __b);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_n_u32 (uint32x2_t __a, const int __b)
++vqshld_n_s64 (int64_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlal_nv2si (__a, __b, __c);
-+ return __builtin_aarch64_uqshl_nv2si_uus (__a, __b);
++ return __builtin_aarch64_sqshl_ndi (__a, __b);
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vqdmlalh_s16 (int32_t __a, int16_t __b, int16_t __c)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_n_u64 (uint64x1_t __a, const int __b)
++vqshlb_n_u8 (uint8_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlalhi (__a, __b, __c);
-+ return (uint64x1_t) {__builtin_aarch64_uqshl_ndi_uus (__a[0], __b)};
++ return __builtin_aarch64_uqshl_nqi_uus (__a, __b);
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vqdmlalh_lane_s16 (int32_t __a, int16_t __b, int16x4_t __c, const int __d)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_n_s8 (int8x16_t __a, const int __b)
++vqshlh_n_u16 (uint16_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlal_lanehi (__a, __b, __c, __d);
-+ return (int8x16_t) __builtin_aarch64_sqshl_nv16qi (__a, __b);
++ return __builtin_aarch64_uqshl_nhi_uus (__a, __b);
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vqdmlalh_laneq_s16 (int32_t __a, int16_t __b, int16x8_t __c, const int __d)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_n_s16 (int16x8_t __a, const int __b)
++vqshls_n_u32 (uint32_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlal_laneqhi (__a, __b, __c, __d);
-+ return (int16x8_t) __builtin_aarch64_sqshl_nv8hi (__a, __b);
++ return __builtin_aarch64_uqshl_nsi_uus (__a, __b);
}
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vqdmlals_s32 (int64_t __a, int32_t __b, int32_t __c)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_n_s32 (int32x4_t __a, const int __b)
++vqshld_n_u64 (uint64_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlalsi (__a, __b, __c);
-+ return (int32x4_t) __builtin_aarch64_sqshl_nv4si (__a, __b);
++ return __builtin_aarch64_uqshl_ndi_uus (__a, __b);
}
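
As a reference for the vqshl_n family being converted above: these shifts saturate instead of wrapping on overflow. A minimal usage sketch (the wrapper name sat_shift_u64 is illustrative, not part of the patch):

#include <arm_neon.h>

/* Saturating left shift by an immediate: clamps to UINT64_MAX
   instead of wrapping on overflow.  */
uint64_t sat_shift_u64 (uint64_t x)
{
  return vqshld_n_u64 (x, 4);
}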
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vqdmlals_lane_s32 (int64_t __a, int32_t __b, int32x2_t __c, const int __d)
-+__extension__ extern __inline int64x2_t
++/* vqshlu */
++
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_n_s64 (int64x2_t __a, const int __b)
++vqshlu_n_s8 (int8x8_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlal_lanesi (__a, __b, __c, __d);
-+ return (int64x2_t) __builtin_aarch64_sqshl_nv2di (__a, __b);
++ return __builtin_aarch64_sqshlu_nv8qi_uss (__a, __b);
}
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vqdmlals_laneq_s32 (int64_t __a, int32_t __b, int32x4_t __c, const int __d)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_n_u8 (uint8x16_t __a, const int __b)
++vqshlu_n_s16 (int16x4_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlal_laneqsi (__a, __b, __c, __d);
-+ return __builtin_aarch64_uqshl_nv16qi_uus (__a, __b);
++ return __builtin_aarch64_sqshlu_nv4hi_uss (__a, __b);
}
-/* vqdmlsl */
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_n_u16 (uint16x8_t __a, const int __b)
-+{
-+ return __builtin_aarch64_uqshl_nv8hi_uus (__a, __b);
-+}
-
+-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_n_u32 (uint32x4_t __a, const int __b)
++vqshlu_n_s32 (int32x2_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlslv4hi (__a, __b, __c);
-+ return __builtin_aarch64_uqshl_nv4si_uus (__a, __b);
++ return __builtin_aarch64_sqshlu_nv2si_uss (__a, __b);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlsl_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_n_u64 (uint64x2_t __a, const int __b)
++vqshlu_n_s64 (int64x1_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlsl2v8hi (__a, __b, __c);
-+ return __builtin_aarch64_uqshl_nv2di_uus (__a, __b);
++ return (uint64x1_t) {__builtin_aarch64_sqshlu_ndi_uss (__a[0], __b)};
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlsl_high_lane_s16 (int32x4_t __a, int16x8_t __b, int16x4_t __c,
- int const __d)
-+__extension__ extern __inline int8_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlb_n_s8 (int8_t __a, const int __b)
++vqshluq_n_s8 (int8x16_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlsl2_lanev8hi (__a, __b, __c, __d);
-+ return (int8_t) __builtin_aarch64_sqshl_nqi (__a, __b);
++ return __builtin_aarch64_sqshlu_nv16qi_uss (__a, __b);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlsl_high_laneq_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c,
- int const __d)
-+__extension__ extern __inline int16_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlh_n_s16 (int16_t __a, const int __b)
++vqshluq_n_s16 (int16x8_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlsl2_laneqv8hi (__a, __b, __c, __d);
-+ return (int16_t) __builtin_aarch64_sqshl_nhi (__a, __b);
++ return __builtin_aarch64_sqshlu_nv8hi_uss (__a, __b);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlsl_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshls_n_s32 (int32_t __a, const int __b)
++vqshluq_n_s32 (int32x4_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlsl2_nv8hi (__a, __b, __c);
-+ return (int32_t) __builtin_aarch64_sqshl_nsi (__a, __b);
++ return __builtin_aarch64_sqshlu_nv4si_uss (__a, __b);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshld_n_s64 (int64_t __a, const int __b)
++vqshluq_n_s64 (int64x2_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlsl_lanev4hi (__a, __b, __c, __d);
-+ return __builtin_aarch64_sqshl_ndi (__a, __b);
++ return __builtin_aarch64_sqshlu_nv2di_uss (__a, __b);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlsl_laneq_s16 (int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d)
-+__extension__ extern __inline uint8_t
++__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlb_n_u8 (uint8_t __a, const int __b)
++vqshlub_n_s8 (int8_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlsl_laneqv4hi (__a, __b, __c, __d);
-+ return __builtin_aarch64_uqshl_nqi_uus (__a, __b);
++ return (int8_t) __builtin_aarch64_sqshlu_nqi_uss (__a, __b);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
-+__extension__ extern __inline uint16_t
++__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlh_n_u16 (uint16_t __a, const int __b)
++vqshluh_n_s16 (int16_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlsl_nv4hi (__a, __b, __c);
-+ return __builtin_aarch64_uqshl_nhi_uus (__a, __b);
++ return (int16_t) __builtin_aarch64_sqshlu_nhi_uss (__a, __b);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshls_n_u32 (uint32_t __a, const int __b)
++vqshlus_n_s32 (int32_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlslv2si (__a, __b, __c);
-+ return __builtin_aarch64_uqshl_nsi_uus (__a, __b);
++ return (int32_t) __builtin_aarch64_sqshlu_nsi_uss (__a, __b);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlsl_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshld_n_u64 (uint64_t __a, const int __b)
++vqshlud_n_s64 (int64_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlsl2v4si (__a, __b, __c);
-+ return __builtin_aarch64_uqshl_ndi_uus (__a, __b);
++ return __builtin_aarch64_sqshlu_ndi_uss (__a, __b);
}
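
The vqshlu variants defined above shift a signed input into an unsigned result, so negative lanes clamp to zero. A minimal sketch (wrapper name illustrative):

#include <arm_neon.h>

/* Signed-to-unsigned saturating shift: negative lanes become 0,
   lanes that overflow become 255.  */
uint8x8_t to_unsigned_q3 (int8x8_t v)
{
  return vqshlu_n_s8 (v, 3);
}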
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlsl_high_lane_s32 (int64x2_t __a, int32x4_t __b, int32x2_t __c,
- int const __d)
-+/* vqshlu */
++/* vqshrn */
+
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlu_n_s8 (int8x8_t __a, const int __b)
++vqshrn_n_s16 (int16x8_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlsl2_lanev4si (__a, __b, __c, __d);
-+ return __builtin_aarch64_sqshlu_nv8qi_uss (__a, __b);
++ return (int8x8_t) __builtin_aarch64_sqshrn_nv8hi (__a, __b);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlsl_high_laneq_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c,
- int const __d)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlu_n_s16 (int16x4_t __a, const int __b)
++vqshrn_n_s32 (int32x4_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlsl2_laneqv4si (__a, __b, __c, __d);
-+ return __builtin_aarch64_sqshlu_nv4hi_uss (__a, __b);
++ return (int16x4_t) __builtin_aarch64_sqshrn_nv4si (__a, __b);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlsl_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlu_n_s32 (int32x2_t __a, const int __b)
++vqshrn_n_s64 (int64x2_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlsl2_nv4si (__a, __b, __c);
-+ return __builtin_aarch64_sqshlu_nv2si_uss (__a, __b);
++ return (int32x2_t) __builtin_aarch64_sqshrn_nv2di (__a, __b);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlu_n_s64 (int64x1_t __a, const int __b)
++vqshrn_n_u16 (uint16x8_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlsl_lanev2si (__a, __b, __c, __d);
-+ return (uint64x1_t) {__builtin_aarch64_sqshlu_ndi_uss (__a[0], __b)};
++ return __builtin_aarch64_uqshrn_nv8hi_uus (__a, __b);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlsl_laneq_s32 (int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshluq_n_s8 (int8x16_t __a, const int __b)
++vqshrn_n_u32 (uint32x4_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlsl_laneqv2si (__a, __b, __c, __d);
-+ return __builtin_aarch64_sqshlu_nv16qi_uss (__a, __b);
++ return __builtin_aarch64_uqshrn_nv4si_uus (__a, __b);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshluq_n_s16 (int16x8_t __a, const int __b)
++vqshrn_n_u64 (uint64x2_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlsl_nv2si (__a, __b, __c);
-+ return __builtin_aarch64_sqshlu_nv8hi_uss (__a, __b);
++ return __builtin_aarch64_uqshrn_nv2di_uus (__a, __b);
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vqdmlslh_s16 (int32_t __a, int16_t __b, int16_t __c)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshluq_n_s32 (int32x4_t __a, const int __b)
++vqshrnh_n_s16 (int16_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlslhi (__a, __b, __c);
-+ return __builtin_aarch64_sqshlu_nv4si_uss (__a, __b);
++ return (int8_t) __builtin_aarch64_sqshrn_nhi (__a, __b);
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vqdmlslh_lane_s16 (int32_t __a, int16_t __b, int16x4_t __c, const int __d)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshluq_n_s64 (int64x2_t __a, const int __b)
++vqshrns_n_s32 (int32_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlsl_lanehi (__a, __b, __c, __d);
-+ return __builtin_aarch64_sqshlu_nv2di_uss (__a, __b);
++ return (int16_t) __builtin_aarch64_sqshrn_nsi (__a, __b);
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vqdmlslh_laneq_s16 (int32_t __a, int16_t __b, int16x8_t __c, const int __d)
-+__extension__ extern __inline int8_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlub_n_s8 (int8_t __a, const int __b)
++vqshrnd_n_s64 (int64_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlsl_laneqhi (__a, __b, __c, __d);
-+ return (int8_t) __builtin_aarch64_sqshlu_nqi_uss (__a, __b);
++ return (int32_t) __builtin_aarch64_sqshrn_ndi (__a, __b);
}
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vqdmlsls_s32 (int64_t __a, int32_t __b, int32_t __c)
-+__extension__ extern __inline int16_t
++__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshluh_n_s16 (int16_t __a, const int __b)
++vqshrnh_n_u16 (uint16_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlslsi (__a, __b, __c);
-+ return (int16_t) __builtin_aarch64_sqshlu_nhi_uss (__a, __b);
++ return __builtin_aarch64_uqshrn_nhi_uus (__a, __b);
}
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vqdmlsls_lane_s32 (int64_t __a, int32_t __b, int32x2_t __c, const int __d)
-+__extension__ extern __inline int32_t
++__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlus_n_s32 (int32_t __a, const int __b)
++vqshrns_n_u32 (uint32_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlsl_lanesi (__a, __b, __c, __d);
-+ return (int32_t) __builtin_aarch64_sqshlu_nsi_uss (__a, __b);
++ return __builtin_aarch64_uqshrn_nsi_uus (__a, __b);
}
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vqdmlsls_laneq_s32 (int64_t __a, int32_t __b, int32x4_t __c, const int __d)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlud_n_s64 (int64_t __a, const int __b)
++vqshrnd_n_u64 (uint64_t __a, const int __b)
{
- return __builtin_aarch64_sqdmlsl_laneqsi (__a, __b, __c, __d);
-+ return __builtin_aarch64_sqshlu_ndi_uss (__a, __b);
++ return __builtin_aarch64_uqshrn_ndi_uus (__a, __b);
}
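
The vqshrn intrinsics above perform a saturating shift-right-narrow, a common fixed-point rescaling step. A minimal sketch (wrapper name illustrative):

#include <arm_neon.h>

/* Narrow 16-bit accumulators to 8 bits: each lane becomes
   saturate (acc >> 8).  */
int8x8_t narrow_q8 (int16x8_t acc)
{
  return vqshrn_n_s16 (acc, 8);
}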
-/* vqdmulh */
-+/* vqshrn */
++/* vqshrun */
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrn_n_s16 (int16x8_t __a, const int __b)
++vqshrun_n_s16 (int16x8_t __a, const int __b)
{
- return __builtin_aarch64_sqdmulh_lanev4hi (__a, __b, __c);
-+ return (int8x8_t) __builtin_aarch64_sqshrn_nv8hi (__a, __b);
++ return (uint8x8_t) __builtin_aarch64_sqshrun_nv8hi (__a, __b);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrn_n_s32 (int32x4_t __a, const int __b)
++vqshrun_n_s32 (int32x4_t __a, const int __b)
{
- return __builtin_aarch64_sqdmulh_lanev2si (__a, __b, __c);
-+ return (int16x4_t) __builtin_aarch64_sqshrn_nv4si (__a, __b);
++ return (uint16x4_t) __builtin_aarch64_sqshrun_nv4si (__a, __b);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrn_n_s64 (int64x2_t __a, const int __b)
++vqshrun_n_s64 (int64x2_t __a, const int __b)
{
- return __builtin_aarch64_sqdmulh_lanev8hi (__a, __b, __c);
-+ return (int32x2_t) __builtin_aarch64_sqshrn_nv2di (__a, __b);
++ return (uint32x2_t) __builtin_aarch64_sqshrun_nv2di (__a, __b);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrn_n_u16 (uint16x8_t __a, const int __b)
++vqshrunh_n_s16 (int16_t __a, const int __b)
{
- return __builtin_aarch64_sqdmulh_lanev4si (__a, __b, __c);
-+ return __builtin_aarch64_uqshrn_nv8hi_uus ( __a, __b);
++ return (int8_t) __builtin_aarch64_sqshrun_nhi (__a, __b);
}
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vqdmulhh_s16 (int16_t __a, int16_t __b)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrn_n_u32 (uint32x4_t __a, const int __b)
++vqshruns_n_s32 (int32_t __a, const int __b)
{
- return (int16_t) __builtin_aarch64_sqdmulhhi (__a, __b);
-+ return __builtin_aarch64_uqshrn_nv4si_uus ( __a, __b);
++ return (int16_t) __builtin_aarch64_sqshrun_nsi (__a, __b);
}
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vqdmulhh_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrn_n_u64 (uint64x2_t __a, const int __b)
++vqshrund_n_s64 (int64_t __a, const int __b)
{
- return __builtin_aarch64_sqdmulh_lanehi (__a, __b, __c);
-+ return __builtin_aarch64_uqshrn_nv2di_uus ( __a, __b);
++ return (int32_t) __builtin_aarch64_sqshrun_ndi (__a, __b);
}
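
vqshrun is the signed-input/unsigned-output counterpart of vqshrn. A minimal sketch (wrapper name illustrative):

#include <arm_neon.h>

/* Narrow signed 16-bit lanes to unsigned 8-bit lanes:
   negatives clamp to 0 before the narrowing shift.  */
uint8x8_t narrow_u8 (int16x8_t acc)
{
  return vqshrun_n_s16 (acc, 6);
}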
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vqdmulhh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c)
++/* vqsub */
++
+__extension__ extern __inline int8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrnh_n_s16 (int16_t __a, const int __b)
++vqsubb_s8 (int8_t __a, int8_t __b)
{
- return __builtin_aarch64_sqdmulh_laneqhi (__a, __b, __c);
-+ return (int8_t) __builtin_aarch64_sqshrn_nhi (__a, __b);
++ return (int8_t) __builtin_aarch64_sqsubqi (__a, __b);
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vqdmulhs_s32 (int32_t __a, int32_t __b)
+__extension__ extern __inline int16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrns_n_s32 (int32_t __a, const int __b)
++vqsubh_s16 (int16_t __a, int16_t __b)
{
- return (int32_t) __builtin_aarch64_sqdmulhsi (__a, __b);
-+ return (int16_t) __builtin_aarch64_sqshrn_nsi (__a, __b);
++ return (int16_t) __builtin_aarch64_sqsubhi (__a, __b);
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vqdmulhs_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
+__extension__ extern __inline int32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrnd_n_s64 (int64_t __a, const int __b)
++vqsubs_s32 (int32_t __a, int32_t __b)
{
- return __builtin_aarch64_sqdmulh_lanesi (__a, __b, __c);
-+ return (int32_t) __builtin_aarch64_sqshrn_ndi (__a, __b);
++ return (int32_t) __builtin_aarch64_sqsubsi (__a, __b);
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vqdmulhs_laneq_s32 (int32_t __a, int32x4_t __b, const int __c)
-+__extension__ extern __inline uint8_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrnh_n_u16 (uint16_t __a, const int __b)
++vqsubd_s64 (int64_t __a, int64_t __b)
{
- return __builtin_aarch64_sqdmulh_laneqsi (__a, __b, __c);
-+ return __builtin_aarch64_uqshrn_nhi_uus (__a, __b);
++ return __builtin_aarch64_sqsubdi (__a, __b);
}
-/* vqdmull */
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmull_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrns_n_u32 (uint32_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmullv4hi (__a, __b);
-+ return __builtin_aarch64_uqshrn_nsi_uus (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmull_high_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrnd_n_u64 (uint64_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmull2v8hi (__a, __b);
-+ return __builtin_aarch64_uqshrn_ndi_uus (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, int const __c)
-+/* vqshrun */
-+
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrun_n_s16 (int16x8_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmull2_lanev8hi (__a, __b,__c);
-+ return (uint8x8_t) __builtin_aarch64_sqshrun_nv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, int const __c)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrun_n_s32 (int32x4_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmull2_laneqv8hi (__a, __b,__c);
-+ return (uint16x4_t) __builtin_aarch64_sqshrun_nv4si (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmull_high_n_s16 (int16x8_t __a, int16_t __b)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrun_n_s64 (int64x2_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmull2_nv8hi (__a, __b);
-+ return (uint32x2_t) __builtin_aarch64_sqshrun_nv2di (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmull_lane_s16 (int16x4_t __a, int16x4_t __b, int const __c)
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrunh_n_s16 (int16_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmull_lanev4hi (__a, __b, __c);
-+ return (int8_t) __builtin_aarch64_sqshrun_nhi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmull_laneq_s16 (int16x4_t __a, int16x8_t __b, int const __c)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshruns_n_s32 (int32_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmull_laneqv4hi (__a, __b, __c);
-+ return (int16_t) __builtin_aarch64_sqshrun_nsi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmull_n_s16 (int16x4_t __a, int16_t __b)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrund_n_s64 (int64_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmull_nv4hi (__a, __b);
-+ return (int32_t) __builtin_aarch64_sqshrun_ndi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmull_s32 (int32x2_t __a, int32x2_t __b)
-+/* vqsub */
-+
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqsubb_s8 (int8_t __a, int8_t __b)
- {
-- return __builtin_aarch64_sqdmullv2si (__a, __b);
-+ return (int8_t) __builtin_aarch64_sqsubqi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmull_high_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqsubh_s16 (int16_t __a, int16_t __b)
- {
-- return __builtin_aarch64_sqdmull2v4si (__a, __b);
-+ return (int16_t) __builtin_aarch64_sqsubhi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, int const __c)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqsubs_s32 (int32_t __a, int32_t __b)
- {
-- return __builtin_aarch64_sqdmull2_lanev4si (__a, __b, __c);
-+ return (int32_t) __builtin_aarch64_sqsubsi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, int const __c)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqsubd_s64 (int64_t __a, int64_t __b)
- {
-- return __builtin_aarch64_sqdmull2_laneqv4si (__a, __b, __c);
-+ return __builtin_aarch64_sqsubdi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmull_high_n_s32 (int32x4_t __a, int32_t __b)
+__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqsubb_u8 (uint8_t __a, uint8_t __b)
{
-- return __builtin_aarch64_sqdmull2_nv4si (__a, __b);
+- return __builtin_aarch64_sqdmullv4hi (__a, __b);
+ return (uint8_t) __builtin_aarch64_uqsubqi_uuu (__a, __b);
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmull_lane_s32 (int32x2_t __a, int32x2_t __b, int const __c)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vqdmull_high_s16 (int16x8_t __a, int16x8_t __b)
+__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqsubh_u16 (uint16_t __a, uint16_t __b)
{
-- return __builtin_aarch64_sqdmull_lanev2si (__a, __b, __c);
+- return __builtin_aarch64_sqdmull2v8hi (__a, __b);
+ return (uint16_t) __builtin_aarch64_uqsubhi_uuu (__a, __b);
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmull_laneq_s32 (int32x2_t __a, int32x4_t __b, int const __c)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vqdmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, int const __c)
+__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqsubs_u32 (uint32_t __a, uint32_t __b)
{
-- return __builtin_aarch64_sqdmull_laneqv2si (__a, __b, __c);
+- return __builtin_aarch64_sqdmull2_lanev8hi (__a, __b,__c);
+ return (uint32_t) __builtin_aarch64_uqsubsi_uuu (__a, __b);
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmull_n_s32 (int32x2_t __a, int32_t __b)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vqdmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, int const __c)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqsubd_u64 (uint64_t __a, uint64_t __b)
{
-- return __builtin_aarch64_sqdmull_nv2si (__a, __b);
+- return __builtin_aarch64_sqdmull2_laneqv8hi (__a, __b,__c);
+ return __builtin_aarch64_uqsubdi_uuu (__a, __b);
}
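
The scalar vqsub intrinsics above subtract with saturation instead of wrap-around. A minimal sketch (wrapper name illustrative):

#include <arm_neon.h>

/* Saturating subtract: yields 0 when b > a rather than
   wrapping modulo 256.  */
uint8_t sat_sub_u8 (uint8_t a, uint8_t b)
{
  return vqsubb_u8 (a, b);
}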
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqdmullh_s16 (int16_t __a, int16_t __b)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vqdmull_high_n_s16 (int16x8_t __a, int16_t __b)
+/* vqtbl2 */
+
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbl2_s8 (int8x16x2_t tab, uint8x8_t idx)
{
-- return (int32_t) __builtin_aarch64_sqdmullhi (__a, __b);
+- return __builtin_aarch64_sqdmull2_nv8hi (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1);
+ return __builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
}
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqdmullh_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vqdmull_lane_s16 (int16x4_t __a, int16x4_t __b, int const __c)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbl2_u8 (uint8x16x2_t tab, uint8x8_t idx)
{
-- return __builtin_aarch64_sqdmull_lanehi (__a, __b, __c);
+- return __builtin_aarch64_sqdmull_lanev4hi (__a, __b, __c);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
+ return (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
}
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqdmullh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vqdmull_laneq_s16 (int16x4_t __a, int16x8_t __b, int const __c)
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbl2_p8 (poly8x16x2_t tab, uint8x8_t idx)
{
-- return __builtin_aarch64_sqdmull_laneqhi (__a, __b, __c);
+- return __builtin_aarch64_sqdmull_laneqv4hi (__a, __b, __c);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
+ return (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
}
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqdmulls_s32 (int32_t __a, int32_t __b)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vqdmull_n_s16 (int16x4_t __a, int16_t __b)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbl2q_s8 (int8x16x2_t tab, uint8x16_t idx)
{
-- return __builtin_aarch64_sqdmullsi (__a, __b);
+- return __builtin_aarch64_sqdmull_nv4hi (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
+ return __builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx);
}
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqdmulls_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vqdmull_s32 (int32x2_t __a, int32x2_t __b)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbl2q_u8 (uint8x16x2_t tab, uint8x16_t idx)
{
-- return __builtin_aarch64_sqdmull_lanesi (__a, __b, __c);
+- return __builtin_aarch64_sqdmullv2si (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
+ return (uint8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx);
}
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqdmulls_laneq_s32 (int32_t __a, int32x4_t __b, const int __c)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vqdmull_high_s32 (int32x4_t __a, int32x4_t __b)
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbl2q_p8 (poly8x16x2_t tab, uint8x16_t idx)
{
-- return __builtin_aarch64_sqdmull_laneqsi (__a, __b, __c);
+- return __builtin_aarch64_sqdmull2v4si (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
+ return (poly8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx);
}
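
vqtbl2 treats the pair of q-registers as a single 32-entry byte table; out-of-range index lanes read as 0. A minimal sketch (wrapper name illustrative):

#include <arm_neon.h>

/* 32-entry byte lookup: index lanes >= 32 produce 0.  */
uint8x8_t lut32 (uint8x16x2_t table, uint8x8_t idx)
{
  return vqtbl2_u8 (table, idx);
}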
--/* vqmovn */
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vqdmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, int const __c)
+/* vqtbl3 */
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqmovn_s16 (int16x8_t __a)
++
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbl3_s8 (int8x16x3_t tab, uint8x8_t idx)
{
-- return (int8x8_t) __builtin_aarch64_sqmovnv8hi (__a);
+- return __builtin_aarch64_sqdmull2_lanev4si (__a, __b, __c);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -37673,13 +39427,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return __builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx);
}
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqmovn_s32 (int32x4_t __a)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vqdmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, int const __c)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbl3_u8 (uint8x16x3_t tab, uint8x8_t idx)
{
-- return (int16x4_t) __builtin_aarch64_sqmovnv4si (__a);
+- return __builtin_aarch64_sqdmull2_laneqv4si (__a, __b, __c);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -37687,13 +39441,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return (uint8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx);
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqmovn_s64 (int64x2_t __a)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vqdmull_high_n_s32 (int32x4_t __a, int32_t __b)
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbl3_p8 (poly8x16x3_t tab, uint8x8_t idx)
{
-- return (int32x2_t) __builtin_aarch64_sqmovnv2di (__a);
+- return __builtin_aarch64_sqdmull2_nv4si (__a, __b);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -37701,13 +39455,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return (poly8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx);
}
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqmovn_u16 (uint16x8_t __a)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vqdmull_lane_s32 (int32x2_t __a, int32x2_t __b, int const __c)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbl3q_s8 (int8x16x3_t tab, uint8x16_t idx)
{
-- return (uint8x8_t) __builtin_aarch64_uqmovnv8hi ((int16x8_t) __a);
+- return __builtin_aarch64_sqdmull_lanev2si (__a, __b, __c);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -37715,13 +39469,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return __builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx);
}
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vqmovn_u32 (uint32x4_t __a)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vqdmull_laneq_s32 (int32x2_t __a, int32x4_t __b, int const __c)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbl3q_u8 (uint8x16x3_t tab, uint8x16_t idx)
{
-- return (uint16x4_t) __builtin_aarch64_uqmovnv4si ((int32x4_t) __a);
+- return __builtin_aarch64_sqdmull_laneqv2si (__a, __b, __c);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -37729,13 +39483,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return (uint8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx);
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vqmovn_u64 (uint64x2_t __a)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vqdmull_n_s32 (int32x2_t __a, int32_t __b)
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbl3q_p8 (poly8x16x3_t tab, uint8x16_t idx)
{
-- return (uint32x2_t) __builtin_aarch64_uqmovnv2di ((int64x2_t) __a);
+- return __builtin_aarch64_sqdmull_nv2si (__a, __b);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -37743,15 +39497,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return (poly8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx);
}
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqmovnh_s16 (int16_t __a)
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vqdmullh_s16 (int16_t __a, int16_t __b)
+/* vqtbl4 */
+
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbl4_s8 (int8x16x4_t tab, uint8x8_t idx)
{
-- return (int8_t) __builtin_aarch64_sqmovnhi (__a);
+- return (int32_t) __builtin_aarch64_sqdmullhi (__a, __b);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -37760,13 +39514,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return __builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx);
}
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqmovns_s32 (int32_t __a)
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vqdmullh_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbl4_u8 (uint8x16x4_t tab, uint8x8_t idx)
{
-- return (int16_t) __builtin_aarch64_sqmovnsi (__a);
+- return __builtin_aarch64_sqdmull_lanehi (__a, __b, __c);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -37776,12 +39530,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqmovnd_s64 (int64_t __a)
+-vqdmullh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c)
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbl4_p8 (poly8x16x4_t tab, uint8x8_t idx)
{
-- return (int32_t) __builtin_aarch64_sqmovndi (__a);
+- return __builtin_aarch64_sqdmull_laneqhi (__a, __b, __c);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -37790,13 +39544,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return (poly8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx);
}
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vqmovnh_u16 (uint16_t __a)
+-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+-vqdmulls_s32 (int32_t __a, int32_t __b)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbl4q_s8 (int8x16x4_t tab, uint8x16_t idx)
{
-- return (uint8_t) __builtin_aarch64_uqmovnhi (__a);
+- return __builtin_aarch64_sqdmullsi (__a, __b);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -37805,13 +39559,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return __builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx);
}
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vqmovns_u32 (uint32_t __a)
+-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+-vqdmulls_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbl4q_u8 (uint8x16x4_t tab, uint8x16_t idx)
{
-- return (uint16_t) __builtin_aarch64_uqmovnsi (__a);
+- return __builtin_aarch64_sqdmull_lanesi (__a, __b, __c);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -37820,13 +39574,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return (uint8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx);
}
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vqmovnd_u64 (uint64_t __a)
+-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+-vqdmulls_laneq_s32 (int32_t __a, int32x4_t __b, const int __c)
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbl4q_p8 (poly8x16x4_t tab, uint8x16_t idx)
{
-- return (uint32_t) __builtin_aarch64_uqmovndi (__a);
+- return __builtin_aarch64_sqdmull_laneqsi (__a, __b, __c);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -37835,29 +39589,29 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return (poly8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx);
}
--/* vqmovun */
+-/* vqmovn */
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqmovun_s16 (int16x8_t __a)
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vqmovn_s16 (int16x8_t __a)
+/* vqtbx2 */
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbx2_s8 (int8x8_t r, int8x16x2_t tab, uint8x8_t idx)
{
-- return (uint8x8_t) __builtin_aarch64_sqmovunv8hi (__a);
+- return (int8x8_t) __builtin_aarch64_sqmovnv8hi (__a);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1);
+ return __builtin_aarch64_tbx4v8qi (r, __o, (int8x8_t)idx);
}
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vqmovun_s32 (int32x4_t __a)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vqmovn_s32 (int32x4_t __a)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbx2_u8 (uint8x8_t r, uint8x16x2_t tab, uint8x8_t idx)
{
-- return (uint16x4_t) __builtin_aarch64_sqmovunv4si (__a);
+- return (int16x4_t) __builtin_aarch64_sqmovnv4si (__a);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -37865,13 +39619,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ (int8x8_t)idx);
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vqmovun_s64 (int64x2_t __a)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vqmovn_s64 (int64x2_t __a)
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbx2_p8 (poly8x8_t r, poly8x16x2_t tab, uint8x8_t idx)
{
-- return (uint32x2_t) __builtin_aarch64_sqmovunv2di (__a);
+- return (int32x2_t) __builtin_aarch64_sqmovnv2di (__a);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -37879,26 +39633,26 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ (int8x8_t)idx);
}
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqmovunh_s16 (int16_t __a)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vqmovn_u16 (uint16x8_t __a)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbx2q_s8 (int8x16_t r, int8x16x2_t tab, uint8x16_t idx)
{
-- return (int8_t) __builtin_aarch64_sqmovunhi (__a);
+- return (uint8x8_t) __builtin_aarch64_uqmovnv8hi ((int16x8_t) __a);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1);
+ return __builtin_aarch64_tbx4v16qi (r, __o, (int8x16_t)idx);
}
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqmovuns_s32 (int32_t __a)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vqmovn_u32 (uint32x4_t __a)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbx2q_u8 (uint8x16_t r, uint8x16x2_t tab, uint8x16_t idx)
{
-- return (int16_t) __builtin_aarch64_sqmovunsi (__a);
+- return (uint16x4_t) __builtin_aarch64_uqmovnv4si ((int32x4_t) __a);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -37906,13 +39660,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ (int8x16_t)idx);
}
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqmovund_s64 (int64_t __a)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vqmovn_u64 (uint64x2_t __a)
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbx2q_p8 (poly8x16_t r, poly8x16x2_t tab, uint8x16_t idx)
{
-- return (int32_t) __builtin_aarch64_sqmovundi (__a);
+- return (uint32x2_t) __builtin_aarch64_uqmovnv2di ((int64x2_t) __a);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -37920,26 +39674,28 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ (int8x16_t)idx);
}
--/* vqneg */
+-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
+-vqmovnh_s16 (int16_t __a)
+/* vqtbx3 */
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbx3_s8 (int8x8_t r, int8x16x3_t tab, uint8x8_t idx)
-+{
+ {
+- return (int8_t) __builtin_aarch64_sqmovnhi (__a);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[2], 2);
+ return __builtin_aarch64_qtbx3v8qi (r, __o, (int8x8_t)idx);
-+}
+ }
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqnegq_s64 (int64x2_t __a)
+-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+-vqmovns_s32 (int32_t __a)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbx3_u8 (uint8x8_t r, uint8x16x3_t tab, uint8x8_t idx)
{
-- return (int64x2_t) __builtin_aarch64_sqnegv2di (__a);
+- return (int16_t) __builtin_aarch64_sqmovnsi (__a);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -37948,13 +39704,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ (int8x8_t)idx);
}
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqnegb_s8 (int8_t __a)
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vqmovnd_s64 (int64_t __a)
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbx3_p8 (poly8x8_t r, poly8x16x3_t tab, uint8x8_t idx)
{
-- return (int8_t) __builtin_aarch64_sqnegqi (__a);
+- return (int32_t) __builtin_aarch64_sqmovndi (__a);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -37963,13 +39719,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ (int8x8_t)idx);
}
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqnegh_s16 (int16_t __a)
+-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
+-vqmovnh_u16 (uint16_t __a)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbx3q_s8 (int8x16_t r, int8x16x3_t tab, uint8x16_t idx)
{
-- return (int16_t) __builtin_aarch64_sqneghi (__a);
+- return (uint8_t) __builtin_aarch64_uqmovnhi (__a);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[1], 1);
@@ -37977,13 +39733,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return __builtin_aarch64_qtbx3v16qi (r, __o, (int8x16_t)idx);
}
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqnegs_s32 (int32_t __a)
+-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
+-vqmovns_u32 (uint32_t __a)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbx3q_u8 (uint8x16_t r, uint8x16x3_t tab, uint8x16_t idx)
{
-- return (int32_t) __builtin_aarch64_sqnegsi (__a);
+- return (uint16_t) __builtin_aarch64_uqmovnsi (__a);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -37992,13 +39748,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ (int8x16_t)idx);
}
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqnegd_s64 (int64_t __a)
+-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+-vqmovnd_u64 (uint64_t __a)
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbx3q_p8 (poly8x16_t r, poly8x16x3_t tab, uint8x16_t idx)
{
-- return __builtin_aarch64_sqnegdi (__a);
+- return (uint32_t) __builtin_aarch64_uqmovndi (__a);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -38007,16 +39763,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ (int8x16_t)idx);
}
--/* vqrdmulh */
+-/* vqmovun */
+/* vqtbx4 */
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqrdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vqmovun_s16 (int16x8_t __a)
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbx4_s8 (int8x8_t r, int8x16x4_t tab, uint8x8_t idx)
{
-- return __builtin_aarch64_sqrdmulh_lanev4hi (__a, __b, __c);
+- return (uint8x8_t) __builtin_aarch64_sqmovunv8hi (__a);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[1], 1);
@@ -38025,13 +39781,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return __builtin_aarch64_qtbx4v8qi (r, __o, (int8x8_t)idx);
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqrdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vqmovun_s32 (int32x4_t __a)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbx4_u8 (uint8x8_t r, uint8x16x4_t tab, uint8x8_t idx)
{
-- return __builtin_aarch64_sqrdmulh_lanev2si (__a, __b, __c);
+- return (uint16x4_t) __builtin_aarch64_sqmovunv4si (__a);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -38041,13 +39797,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ (int8x8_t)idx);
}
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vqrdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vqmovun_s64 (int64x2_t __a)
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbx4_p8 (poly8x8_t r, poly8x16x4_t tab, uint8x8_t idx)
{
-- return __builtin_aarch64_sqrdmulh_lanev8hi (__a, __b, __c);
+- return (uint32x2_t) __builtin_aarch64_sqmovunv2di (__a);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -38057,13 +39813,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ (int8x8_t)idx);
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqrdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
+-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
+-vqmovunh_s16 (int16_t __a)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbx4q_s8 (int8x16_t r, int8x16x4_t tab, uint8x16_t idx)
{
-- return __builtin_aarch64_sqrdmulh_lanev4si (__a, __b, __c);
+- return (int8_t) __builtin_aarch64_sqmovunhi (__a);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[1], 1);
@@ -38073,12 +39829,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqrdmulhh_s16 (int16_t __a, int16_t __b)
+-vqmovuns_s32 (int32_t __a)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbx4q_u8 (uint8x16_t r, uint8x16x4_t tab, uint8x16_t idx)
{
-- return (int16_t) __builtin_aarch64_sqrdmulhhi (__a, __b);
+- return (int16_t) __builtin_aarch64_sqmovunsi (__a);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -38088,13 +39844,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ (int8x16_t)idx);
}
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqrdmulhh_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vqmovund_s64 (int64_t __a)
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqtbx4q_p8 (poly8x16_t r, poly8x16x4_t tab, uint8x16_t idx)
{
-- return __builtin_aarch64_sqrdmulh_lanehi (__a, __b, __c);
+- return (int32_t) __builtin_aarch64_sqmovundi (__a);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
@@ -38104,254 +39860,242 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ (int8x16_t)idx);
}
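
Unlike the vqtbl forms, the vqtbx forms above take a fallback vector r: out-of-range index lanes keep the corresponding lane of r instead of being zeroed. A minimal sketch (wrapper name illustrative):

#include <arm_neon.h>

/* 64-entry lookup with fallback: lanes with idx >= 64 keep r.  */
uint8x8_t lut64_or (uint8x8_t r, uint8x16x4_t table, uint8x8_t idx)
{
  return vqtbx4_u8 (r, table, idx);
}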
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqrdmulhh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c)
--{
-- return __builtin_aarch64_sqrdmulh_laneqhi (__a, __b, __c);
--}
+-/* vqneg */
+/* vrbit */
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqrdmulhs_s32 (int32_t __a, int32_t __b)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vqnegq_s64 (int64x2_t __a)
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrbit_p8 (poly8x8_t __a)
{
-- return (int32_t) __builtin_aarch64_sqrdmulhsi (__a, __b);
+- return (int64x2_t) __builtin_aarch64_sqnegv2di (__a);
+ return (poly8x8_t) __builtin_aarch64_rbitv8qi ((int8x8_t) __a);
}
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqrdmulhs_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
+-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
+-vqnegb_s8 (int8_t __a)
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrbit_s8 (int8x8_t __a)
{
-- return __builtin_aarch64_sqrdmulh_lanesi (__a, __b, __c);
+- return (int8_t) __builtin_aarch64_sqnegqi (__a);
+ return __builtin_aarch64_rbitv8qi (__a);
}
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqrdmulhs_laneq_s32 (int32_t __a, int32x4_t __b, const int __c)
+-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+-vqnegh_s16 (int16_t __a)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrbit_u8 (uint8x8_t __a)
{
-- return __builtin_aarch64_sqrdmulh_laneqsi (__a, __b, __c);
+- return (int16_t) __builtin_aarch64_sqneghi (__a);
+ return (uint8x8_t) __builtin_aarch64_rbitv8qi ((int8x8_t) __a);
}
--/* vqrshl */
--
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqrshl_s8 (int8x8_t __a, int8x8_t __b)
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vqnegs_s32 (int32_t __a)
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrbitq_p8 (poly8x16_t __a)
{
-- return __builtin_aarch64_sqrshlv8qi (__a, __b);
+- return (int32_t) __builtin_aarch64_sqnegsi (__a);
+ return (poly8x16_t) __builtin_aarch64_rbitv16qi ((int8x16_t)__a);
}
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqrshl_s16 (int16x4_t __a, int16x4_t __b)
+-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+-vqnegd_s64 (int64_t __a)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrbitq_s8 (int8x16_t __a)
{
-- return __builtin_aarch64_sqrshlv4hi (__a, __b);
+- return __builtin_aarch64_sqnegdi (__a);
+ return __builtin_aarch64_rbitv16qi (__a);
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqrshl_s32 (int32x2_t __a, int32x2_t __b)
+-/* vqrdmulh */
+-
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vqrdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrbitq_u8 (uint8x16_t __a)
{
-- return __builtin_aarch64_sqrshlv2si (__a, __b);
+- return __builtin_aarch64_sqrdmulh_lanev4hi (__a, __b, __c);
+ return (uint8x16_t) __builtin_aarch64_rbitv16qi ((int8x16_t) __a);
}
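
vrbit reverses the bit order within every byte lane, which is handy for bit-reflected algorithms such as reflected CRCs. A minimal sketch, assuming the intrinsics defined above:

#include <arm_neon.h>

uint8x8_t
reflect_bits (uint8x8_t bytes)
{
  /* Per lane: 0x01 becomes 0x80, 0xb4 becomes 0x2d, and so on.  */
  return vrbit_u8 (bytes);
}
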
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vqrshl_s64 (int64x1_t __a, int64x1_t __b)
--{
-- return (int64x1_t) {__builtin_aarch64_sqrshldi (__a[0], __b[0])};
--}
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vqrdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+/* vrecpe */
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqrshl_u8 (uint8x8_t __a, int8x8_t __b)
++
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpe_u32 (uint32x2_t __a)
{
-- return __builtin_aarch64_uqrshlv8qi_uus ( __a, __b);
+- return __builtin_aarch64_sqrdmulh_lanev2si (__a, __b, __c);
+ return (uint32x2_t) __builtin_aarch64_urecpev2si ((int32x2_t) __a);
}
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vqrshl_u16 (uint16x4_t __a, int16x4_t __b)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vqrdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpeq_u32 (uint32x4_t __a)
{
-- return __builtin_aarch64_uqrshlv4hi_uus ( __a, __b);
+- return __builtin_aarch64_sqrdmulh_lanev8hi (__a, __b, __c);
+ return (uint32x4_t) __builtin_aarch64_urecpev4si ((int32x4_t) __a);
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vqrshl_u32 (uint32x2_t __a, int32x2_t __b)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vqrdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpes_f32 (float32_t __a)
{
-- return __builtin_aarch64_uqrshlv2si_uus ( __a, __b);
+- return __builtin_aarch64_sqrdmulh_lanev4si (__a, __b, __c);
+ return __builtin_aarch64_frecpesf (__a);
}
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vqrshl_u64 (uint64x1_t __a, int64x1_t __b)
+-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+-vqrdmulhh_s16 (int16_t __a, int16_t __b)
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecped_f64 (float64_t __a)
{
-- return (uint64x1_t) {__builtin_aarch64_uqrshldi_uus (__a[0], __b[0])};
+- return (int16_t) __builtin_aarch64_sqrdmulhhi (__a, __b);
+ return __builtin_aarch64_frecpedf (__a);
}
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vqrshlq_s8 (int8x16_t __a, int8x16_t __b)
+-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+-vqrdmulhh_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpe_f32 (float32x2_t __a)
{
-- return __builtin_aarch64_sqrshlv16qi (__a, __b);
+- return __builtin_aarch64_sqrdmulh_lanehi (__a, __b, __c);
+ return __builtin_aarch64_frecpev2sf (__a);
}
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vqrshlq_s16 (int16x8_t __a, int16x8_t __b)
+-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+-vqrdmulhh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpe_f64 (float64x1_t __a)
{
-- return __builtin_aarch64_sqrshlv8hi (__a, __b);
+- return __builtin_aarch64_sqrdmulh_laneqhi (__a, __b, __c);
+ return (float64x1_t) { vrecped_f64 (vget_lane_f64 (__a, 0)) };
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqrshlq_s32 (int32x4_t __a, int32x4_t __b)
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vqrdmulhs_s32 (int32_t __a, int32_t __b)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpeq_f32 (float32x4_t __a)
{
-- return __builtin_aarch64_sqrshlv4si (__a, __b);
+- return (int32_t) __builtin_aarch64_sqrdmulhsi (__a, __b);
+ return __builtin_aarch64_frecpev4sf (__a);
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqrshlq_s64 (int64x2_t __a, int64x2_t __b)
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vqrdmulhs_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpeq_f64 (float64x2_t __a)
{
-- return __builtin_aarch64_sqrshlv2di (__a, __b);
+- return __builtin_aarch64_sqrdmulh_lanesi (__a, __b, __c);
+ return __builtin_aarch64_frecpev2df (__a);
}
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vqrshlq_u8 (uint8x16_t __a, int8x16_t __b)
--{
-- return __builtin_aarch64_uqrshlv16qi_uus ( __a, __b);
--}
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vqrdmulhs_laneq_s32 (int32_t __a, int32x4_t __b, const int __c)
+/* vrecps */
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vqrshlq_u16 (uint16x8_t __a, int16x8_t __b)
++
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpss_f32 (float32_t __a, float32_t __b)
{
-- return __builtin_aarch64_uqrshlv8hi_uus ( __a, __b);
+- return __builtin_aarch64_sqrdmulh_laneqsi (__a, __b, __c);
+ return __builtin_aarch64_frecpssf (__a, __b);
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vqrshlq_u32 (uint32x4_t __a, int32x4_t __b)
+-/* vqrshl */
+-
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vqrshl_s8 (int8x8_t __a, int8x8_t __b)
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpsd_f64 (float64_t __a, float64_t __b)
{
-- return __builtin_aarch64_uqrshlv4si_uus ( __a, __b);
+- return __builtin_aarch64_sqrshlv8qi (__a, __b);
+ return __builtin_aarch64_frecpsdf (__a, __b);
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vqrshlq_u64 (uint64x2_t __a, int64x2_t __b)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vqrshl_s16 (int16x4_t __a, int16x4_t __b)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecps_f32 (float32x2_t __a, float32x2_t __b)
{
-- return __builtin_aarch64_uqrshlv2di_uus ( __a, __b);
+- return __builtin_aarch64_sqrshlv4hi (__a, __b);
+ return __builtin_aarch64_frecpsv2sf (__a, __b);
}
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqrshlb_s8 (int8_t __a, int8_t __b)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vqrshl_s32 (int32x2_t __a, int32x2_t __b)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecps_f64 (float64x1_t __a, float64x1_t __b)
{
-- return __builtin_aarch64_sqrshlqi (__a, __b);
+- return __builtin_aarch64_sqrshlv2si (__a, __b);
+ return (float64x1_t) { vrecpsd_f64 (vget_lane_f64 (__a, 0),
+ vget_lane_f64 (__b, 0)) };
}
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqrshlh_s16 (int16_t __a, int16_t __b)
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+-vqrshl_s64 (int64x1_t __a, int64x1_t __b)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpsq_f32 (float32x4_t __a, float32x4_t __b)
{
-- return __builtin_aarch64_sqrshlhi (__a, __b);
+- return (int64x1_t) {__builtin_aarch64_sqrshldi (__a[0], __b[0])};
+ return __builtin_aarch64_frecpsv4sf (__a, __b);
}
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqrshls_s32 (int32_t __a, int32_t __b)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vqrshl_u8 (uint8x8_t __a, int8x8_t __b)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpsq_f64 (float64x2_t __a, float64x2_t __b)
{
-- return __builtin_aarch64_sqrshlsi (__a, __b);
+- return __builtin_aarch64_uqrshlv8qi_uus ( __a, __b);
+ return __builtin_aarch64_frecpsv2df (__a, __b);
}
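
vrecpe only yields a low-precision reciprocal estimate; vrecps computes the Newton-Raphson correction factor (2 - a*x). The customary idiom combines the two, as in this sketch with two refinement steps for near single-precision accuracy:

#include <arm_neon.h>

float32x2_t
reciprocal (float32x2_t a)
{
  float32x2_t x = vrecpe_f32 (a);        /* initial estimate        */
  x = vmul_f32 (x, vrecps_f32 (a, x));   /* x *= (2 - a*x)          */
  x = vmul_f32 (x, vrecps_f32 (a, x));   /* second refinement step  */
  return x;
}
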
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqrshld_s64 (int64_t __a, int64_t __b)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vqrshl_u16 (uint16x4_t __a, int16x4_t __b)
+/* vrecpx */
+
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpxs_f32 (float32_t __a)
{
-- return __builtin_aarch64_sqrshldi (__a, __b);
+- return __builtin_aarch64_uqrshlv4hi_uus ( __a, __b);
+ return __builtin_aarch64_frecpxsf (__a);
}
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vqrshlb_u8 (uint8_t __a, uint8_t __b)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vqrshl_u32 (uint32x2_t __a, int32x2_t __b)
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpxd_f64 (float64_t __a)
{
-- return __builtin_aarch64_uqrshlqi_uus (__a, __b);
+- return __builtin_aarch64_uqrshlv2si_uus ( __a, __b);
+ return __builtin_aarch64_frecpxdf (__a);
}
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vqrshlh_u16 (uint16_t __a, uint16_t __b)
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+-vqrshl_u64 (uint64x1_t __a, int64x1_t __b)
+
+/* vrev */
+
@@ -38359,1295 +40103,1458 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev16_p8 (poly8x8_t a)
{
-- return __builtin_aarch64_uqrshlhi_uus (__a, __b);
+- return (uint64x1_t) {__builtin_aarch64_uqrshldi_uus (__a[0], __b[0])};
+ return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
}
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vqrshls_u32 (uint32_t __a, uint32_t __b)
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+-vqrshlq_s8 (int8x16_t __a, int8x16_t __b)
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev16_s8 (int8x8_t a)
{
-- return __builtin_aarch64_uqrshlsi_uus (__a, __b);
+- return __builtin_aarch64_sqrshlv16qi (__a, __b);
+ return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
}
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vqrshld_u64 (uint64_t __a, uint64_t __b)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vqrshlq_s16 (int16x8_t __a, int16x8_t __b)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev16_u8 (uint8x8_t a)
{
-- return __builtin_aarch64_uqrshldi_uus (__a, __b);
+- return __builtin_aarch64_sqrshlv8hi (__a, __b);
+ return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
}
--/* vqrshrn */
--
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqrshrn_n_s16 (int16x8_t __a, const int __b)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vqrshlq_s32 (int32x4_t __a, int32x4_t __b)
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev16q_p8 (poly8x16_t a)
{
-- return (int8x8_t) __builtin_aarch64_sqrshrn_nv8hi (__a, __b);
+- return __builtin_aarch64_sqrshlv4si (__a, __b);
+ return __builtin_shuffle (a,
+ (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
}
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqrshrn_n_s32 (int32x4_t __a, const int __b)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vqrshlq_s64 (int64x2_t __a, int64x2_t __b)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev16q_s8 (int8x16_t a)
{
-- return (int16x4_t) __builtin_aarch64_sqrshrn_nv4si (__a, __b);
+- return __builtin_aarch64_sqrshlv2di (__a, __b);
+ return __builtin_shuffle (a,
+ (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqrshrn_n_s64 (int64x2_t __a, const int __b)
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vqrshlq_u8 (uint8x16_t __a, int8x16_t __b)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev16q_u8 (uint8x16_t a)
{
-- return (int32x2_t) __builtin_aarch64_sqrshrn_nv2di (__a, __b);
+- return __builtin_aarch64_uqrshlv16qi_uus ( __a, __b);
+ return __builtin_shuffle (a,
+ (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
}
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqrshrn_n_u16 (uint16x8_t __a, const int __b)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vqrshlq_u16 (uint16x8_t __a, int16x8_t __b)
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev32_p8 (poly8x8_t a)
{
-- return __builtin_aarch64_uqrshrn_nv8hi_uus ( __a, __b);
+- return __builtin_aarch64_uqrshlv8hi_uus ( __a, __b);
+ return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
}
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vqrshrn_n_u32 (uint32x4_t __a, const int __b)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vqrshlq_u32 (uint32x4_t __a, int32x4_t __b)
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev32_p16 (poly16x4_t a)
{
-- return __builtin_aarch64_uqrshrn_nv4si_uus ( __a, __b);
+- return __builtin_aarch64_uqrshlv4si_uus ( __a, __b);
+ return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 });
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vqrshrn_n_u64 (uint64x2_t __a, const int __b)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vqrshlq_u64 (uint64x2_t __a, int64x2_t __b)
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev32_s8 (int8x8_t a)
{
-- return __builtin_aarch64_uqrshrn_nv2di_uus ( __a, __b);
+- return __builtin_aarch64_uqrshlv2di_uus ( __a, __b);
+ return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
}
-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqrshrnh_n_s16 (int16_t __a, const int __b)
+-vqrshlb_s8 (int8_t __a, int8_t __b)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev32_s16 (int16x4_t a)
{
-- return (int8_t) __builtin_aarch64_sqrshrn_nhi (__a, __b);
+- return __builtin_aarch64_sqrshlqi (__a, __b);
+ return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 });
}
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqrshrns_n_s32 (int32_t __a, const int __b)
+-vqrshlh_s16 (int16_t __a, int16_t __b)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev32_u8 (uint8x8_t a)
{
-- return (int16_t) __builtin_aarch64_sqrshrn_nsi (__a, __b);
+- return __builtin_aarch64_sqrshlhi (__a, __b);
+ return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
}
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqrshrnd_n_s64 (int64_t __a, const int __b)
+-vqrshls_s32 (int32_t __a, int32_t __b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev32_u16 (uint16x4_t a)
{
-- return (int32_t) __builtin_aarch64_sqrshrn_ndi (__a, __b);
+- return __builtin_aarch64_sqrshlsi (__a, __b);
+ return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 });
}
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vqrshrnh_n_u16 (uint16_t __a, const int __b)
+-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+-vqrshld_s64 (int64_t __a, int64_t __b)
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev32q_p8 (poly8x16_t a)
{
-- return __builtin_aarch64_uqrshrn_nhi_uus (__a, __b);
+- return __builtin_aarch64_sqrshldi (__a, __b);
+ return __builtin_shuffle (a,
+ (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
}
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vqrshrns_n_u32 (uint32_t __a, const int __b)
+-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
+-vqrshlb_u8 (uint8_t __a, uint8_t __b)
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev32q_p16 (poly16x8_t a)
{
-- return __builtin_aarch64_uqrshrn_nsi_uus (__a, __b);
+- return __builtin_aarch64_uqrshlqi_uus (__a, __b);
+ return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
}
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vqrshrnd_n_u64 (uint64_t __a, const int __b)
+-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
+-vqrshlh_u16 (uint16_t __a, uint16_t __b)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev32q_s8 (int8x16_t a)
{
-- return __builtin_aarch64_uqrshrn_ndi_uus (__a, __b);
+- return __builtin_aarch64_uqrshlhi_uus (__a, __b);
+ return __builtin_shuffle (a,
+ (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
}
--/* vqrshrun */
--
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqrshrun_n_s16 (int16x8_t __a, const int __b)
+-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+-vqrshls_u32 (uint32_t __a, uint32_t __b)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev32q_s16 (int16x8_t a)
{
-- return (uint8x8_t) __builtin_aarch64_sqrshrun_nv8hi (__a, __b);
+- return __builtin_aarch64_uqrshlsi_uus (__a, __b);
+ return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
}
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vqrshrun_n_s32 (int32x4_t __a, const int __b)
+-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+-vqrshld_u64 (uint64_t __a, uint64_t __b)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev32q_u8 (uint8x16_t a)
{
-- return (uint16x4_t) __builtin_aarch64_sqrshrun_nv4si (__a, __b);
+- return __builtin_aarch64_uqrshldi_uus (__a, __b);
+ return __builtin_shuffle (a,
+ (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vqrshrun_n_s64 (int64x2_t __a, const int __b)
+-/* vqrshrn */
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev32q_u16 (uint16x8_t a)
- {
-- return (uint32x2_t) __builtin_aarch64_sqrshrun_nv2di (__a, __b);
++{
+ return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
- }
++}
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqrshrunh_n_s16 (int16_t __a, const int __b)
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vqrshrn_n_s16 (int16x8_t __a, const int __b)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64_f16 (float16x4_t __a)
{
-- return (int8_t) __builtin_aarch64_sqrshrun_nhi (__a, __b);
+- return (int8x8_t) __builtin_aarch64_sqrshrn_nv8hi (__a, __b);
+ return __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 });
}
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqrshruns_n_s32 (int32_t __a, const int __b)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vqrshrn_n_s32 (int32x4_t __a, const int __b)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64_f32 (float32x2_t a)
{
-- return (int16_t) __builtin_aarch64_sqrshrun_nsi (__a, __b);
+- return (int16x4_t) __builtin_aarch64_sqrshrn_nv4si (__a, __b);
+ return __builtin_shuffle (a, (uint32x2_t) { 1, 0 });
}
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqrshrund_n_s64 (int64_t __a, const int __b)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vqrshrn_n_s64 (int64x2_t __a, const int __b)
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64_p8 (poly8x8_t a)
{
-- return (int32_t) __builtin_aarch64_sqrshrun_ndi (__a, __b);
+- return (int32x2_t) __builtin_aarch64_sqrshrn_nv2di (__a, __b);
+ return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
}
--/* vqshl */
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vqrshrn_n_u16 (uint16x8_t __a, const int __b)
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64_p16 (poly16x4_t a)
-+{
+ {
+- return __builtin_aarch64_uqrshrn_nv8hi_uus ( __a, __b);
+ return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 });
-+}
+ }
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqshl_s8 (int8x8_t __a, int8x8_t __b)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vqrshrn_n_u32 (uint32x4_t __a, const int __b)
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64_s8 (int8x8_t a)
{
-- return __builtin_aarch64_sqshlv8qi (__a, __b);
+- return __builtin_aarch64_uqrshrn_nv4si_uus ( __a, __b);
+ return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
}
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqshl_s16 (int16x4_t __a, int16x4_t __b)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vqrshrn_n_u64 (uint64x2_t __a, const int __b)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64_s16 (int16x4_t a)
{
-- return __builtin_aarch64_sqshlv4hi (__a, __b);
+- return __builtin_aarch64_uqrshrn_nv2di_uus ( __a, __b);
+ return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 });
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqshl_s32 (int32x2_t __a, int32x2_t __b)
+-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
+-vqrshrnh_n_s16 (int16_t __a, const int __b)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64_s32 (int32x2_t a)
{
-- return __builtin_aarch64_sqshlv2si (__a, __b);
+- return (int8_t) __builtin_aarch64_sqrshrn_nhi (__a, __b);
+ return __builtin_shuffle (a, (uint32x2_t) { 1, 0 });
}
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vqshl_s64 (int64x1_t __a, int64x1_t __b)
+-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+-vqrshrns_n_s32 (int32_t __a, const int __b)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64_u8 (uint8x8_t a)
{
-- return (int64x1_t) {__builtin_aarch64_sqshldi (__a[0], __b[0])};
+- return (int16_t) __builtin_aarch64_sqrshrn_nsi (__a, __b);
+ return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
}
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqshl_u8 (uint8x8_t __a, int8x8_t __b)
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vqrshrnd_n_s64 (int64_t __a, const int __b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64_u16 (uint16x4_t a)
{
-- return __builtin_aarch64_uqshlv8qi_uus ( __a, __b);
+- return (int32_t) __builtin_aarch64_sqrshrn_ndi (__a, __b);
+ return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 });
}
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vqshl_u16 (uint16x4_t __a, int16x4_t __b)
+-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
+-vqrshrnh_n_u16 (uint16_t __a, const int __b)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64_u32 (uint32x2_t a)
{
-- return __builtin_aarch64_uqshlv4hi_uus ( __a, __b);
+- return __builtin_aarch64_uqrshrn_nhi_uus (__a, __b);
+ return __builtin_shuffle (a, (uint32x2_t) { 1, 0 });
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vqshl_u32 (uint32x2_t __a, int32x2_t __b)
+-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
+-vqrshrns_n_u32 (uint32_t __a, const int __b)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64q_f16 (float16x8_t __a)
{
-- return __builtin_aarch64_uqshlv2si_uus ( __a, __b);
+- return __builtin_aarch64_uqrshrn_nsi_uus (__a, __b);
+ return __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
}
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vqshl_u64 (uint64x1_t __a, int64x1_t __b)
+-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+-vqrshrnd_n_u64 (uint64_t __a, const int __b)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64q_f32 (float32x4_t a)
{
-- return (uint64x1_t) {__builtin_aarch64_uqshldi_uus (__a[0], __b[0])};
+- return __builtin_aarch64_uqrshrn_ndi_uus (__a, __b);
+ return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 });
}
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vqshlq_s8 (int8x16_t __a, int8x16_t __b)
+-/* vqrshrun */
+-
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vqrshrun_n_s16 (int16x8_t __a, const int __b)
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64q_p8 (poly8x16_t a)
{
-- return __builtin_aarch64_sqshlv16qi (__a, __b);
+- return (uint8x8_t) __builtin_aarch64_sqrshrun_nv8hi (__a, __b);
+ return __builtin_shuffle (a,
+ (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
}
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vqshlq_s16 (int16x8_t __a, int16x8_t __b)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vqrshrun_n_s32 (int32x4_t __a, const int __b)
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64q_p16 (poly16x8_t a)
{
-- return __builtin_aarch64_sqshlv8hi (__a, __b);
+- return (uint16x4_t) __builtin_aarch64_sqrshrun_nv4si (__a, __b);
+ return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqshlq_s32 (int32x4_t __a, int32x4_t __b)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vqrshrun_n_s64 (int64x2_t __a, const int __b)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64q_s8 (int8x16_t a)
{
-- return __builtin_aarch64_sqshlv4si (__a, __b);
+- return (uint32x2_t) __builtin_aarch64_sqrshrun_nv2di (__a, __b);
+ return __builtin_shuffle (a,
+ (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqshlq_s64 (int64x2_t __a, int64x2_t __b)
+-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
+-vqrshrunh_n_s16 (int16_t __a, const int __b)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64q_s16 (int16x8_t a)
{
-- return __builtin_aarch64_sqshlv2di (__a, __b);
+- return (int8_t) __builtin_aarch64_sqrshrun_nhi (__a, __b);
+ return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
}
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vqshlq_u8 (uint8x16_t __a, int8x16_t __b)
+-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+-vqrshruns_n_s32 (int32_t __a, const int __b)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64q_s32 (int32x4_t a)
{
-- return __builtin_aarch64_uqshlv16qi_uus ( __a, __b);
+- return (int16_t) __builtin_aarch64_sqrshrun_nsi (__a, __b);
+ return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 });
}
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vqshlq_u16 (uint16x8_t __a, int16x8_t __b)
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vqrshrund_n_s64 (int64_t __a, const int __b)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64q_u8 (uint8x16_t a)
{
-- return __builtin_aarch64_uqshlv8hi_uus ( __a, __b);
+- return (int32_t) __builtin_aarch64_sqrshrun_ndi (__a, __b);
+ return __builtin_shuffle (a,
+ (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vqshlq_u32 (uint32x4_t __a, int32x4_t __b)
+-/* vqshl */
+-
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vqshl_s8 (int8x8_t __a, int8x8_t __b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64q_u16 (uint16x8_t a)
{
-- return __builtin_aarch64_uqshlv4si_uus ( __a, __b);
+- return __builtin_aarch64_sqshlv8qi (__a, __b);
+ return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vqshlq_u64 (uint64x2_t __a, int64x2_t __b)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vqshl_s16 (int16x4_t __a, int16x4_t __b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64q_u32 (uint32x4_t a)
{
-- return __builtin_aarch64_uqshlv2di_uus ( __a, __b);
+- return __builtin_aarch64_sqshlv4hi (__a, __b);
+ return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 });
}
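
The vrev family reverses elements inside fixed-size 16-, 32- or 64-bit containers, the usual building block for endianness conversion. For instance, byte-swapping a vector of 32-bit words; the function name here is illustrative:

#include <arm_neon.h>

uint32x4_t
swap_endian32 (uint32x4_t words)
{
  /* Reverse the four bytes inside each 32-bit element.  */
  uint8x16_t bytes = vreinterpretq_u8_u32 (words);
  return vreinterpretq_u32_u8 (vrev32q_u8 (bytes));
}
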
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqshlb_s8 (int8_t __a, int8_t __b)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vqshl_s32 (int32x2_t __a, int32x2_t __b)
+/* vrnd */
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrnd_f32 (float32x2_t __a)
{
-- return __builtin_aarch64_sqshlqi (__a, __b);
+- return __builtin_aarch64_sqshlv2si (__a, __b);
+ return __builtin_aarch64_btruncv2sf (__a);
}
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqshlh_s16 (int16_t __a, int16_t __b)
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+-vqshl_s64 (int64x1_t __a, int64x1_t __b)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrnd_f64 (float64x1_t __a)
{
-- return __builtin_aarch64_sqshlhi (__a, __b);
+- return (int64x1_t) {__builtin_aarch64_sqshldi (__a[0], __b[0])};
+ return vset_lane_f64 (__builtin_trunc (vget_lane_f64 (__a, 0)), __a, 0);
}
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqshls_s32 (int32_t __a, int32_t __b)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vqshl_u8 (uint8x8_t __a, int8x8_t __b)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndq_f32 (float32x4_t __a)
{
-- return __builtin_aarch64_sqshlsi (__a, __b);
+- return __builtin_aarch64_uqshlv8qi_uus ( __a, __b);
+ return __builtin_aarch64_btruncv4sf (__a);
}
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqshld_s64 (int64_t __a, int64_t __b)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vqshl_u16 (uint16x4_t __a, int16x4_t __b)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndq_f64 (float64x2_t __a)
{
-- return __builtin_aarch64_sqshldi (__a, __b);
+- return __builtin_aarch64_uqshlv4hi_uus ( __a, __b);
+ return __builtin_aarch64_btruncv2df (__a);
}
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vqshlb_u8 (uint8_t __a, uint8_t __b)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vqshl_u32 (uint32x2_t __a, int32x2_t __b)
+/* vrnda */
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrnda_f32 (float32x2_t __a)
{
-- return __builtin_aarch64_uqshlqi_uus (__a, __b);
+- return __builtin_aarch64_uqshlv2si_uus ( __a, __b);
+ return __builtin_aarch64_roundv2sf (__a);
}
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vqshlh_u16 (uint16_t __a, uint16_t __b)
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+-vqshl_u64 (uint64x1_t __a, int64x1_t __b)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrnda_f64 (float64x1_t __a)
{
-- return __builtin_aarch64_uqshlhi_uus (__a, __b);
+- return (uint64x1_t) {__builtin_aarch64_uqshldi_uus (__a[0], __b[0])};
+ return vset_lane_f64 (__builtin_round (vget_lane_f64 (__a, 0)), __a, 0);
}
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vqshls_u32 (uint32_t __a, uint32_t __b)
-+__extension__ extern __inline float32x4_t
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+-vqshlq_s8 (int8x16_t __a, int8x16_t __b)
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndaq_f32 (float32x4_t __a)
{
-- return __builtin_aarch64_uqshlsi_uus (__a, __b);
+- return __builtin_aarch64_sqshlv16qi (__a, __b);
+ return __builtin_aarch64_roundv4sf (__a);
}
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vqshld_u64 (uint64_t __a, uint64_t __b)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vqshlq_s16 (int16x8_t __a, int16x8_t __b)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndaq_f64 (float64x2_t __a)
{
-- return __builtin_aarch64_uqshldi_uus (__a, __b);
+- return __builtin_aarch64_sqshlv8hi (__a, __b);
+ return __builtin_aarch64_roundv2df (__a);
}
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqshl_n_s8 (int8x8_t __a, const int __b)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vqshlq_s32 (int32x4_t __a, int32x4_t __b)
+/* vrndi */
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndi_f32 (float32x2_t __a)
{
-- return (int8x8_t) __builtin_aarch64_sqshl_nv8qi (__a, __b);
+- return __builtin_aarch64_sqshlv4si (__a, __b);
+ return __builtin_aarch64_nearbyintv2sf (__a);
}
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqshl_n_s16 (int16x4_t __a, const int __b)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vqshlq_s64 (int64x2_t __a, int64x2_t __b)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndi_f64 (float64x1_t __a)
{
-- return (int16x4_t) __builtin_aarch64_sqshl_nv4hi (__a, __b);
+- return __builtin_aarch64_sqshlv2di (__a, __b);
+ return vset_lane_f64 (__builtin_nearbyint (vget_lane_f64 (__a, 0)), __a, 0);
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqshl_n_s32 (int32x2_t __a, const int __b)
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vqshlq_u8 (uint8x16_t __a, int8x16_t __b)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndiq_f32 (float32x4_t __a)
{
-- return (int32x2_t) __builtin_aarch64_sqshl_nv2si (__a, __b);
+- return __builtin_aarch64_uqshlv16qi_uus ( __a, __b);
+ return __builtin_aarch64_nearbyintv4sf (__a);
}
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vqshl_n_s64 (int64x1_t __a, const int __b)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vqshlq_u16 (uint16x8_t __a, int16x8_t __b)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndiq_f64 (float64x2_t __a)
{
-- return (int64x1_t) {__builtin_aarch64_sqshl_ndi (__a[0], __b)};
+- return __builtin_aarch64_uqshlv8hi_uus ( __a, __b);
+ return __builtin_aarch64_nearbyintv2df (__a);
}
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqshl_n_u8 (uint8x8_t __a, const int __b)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vqshlq_u32 (uint32x4_t __a, int32x4_t __b)
+/* vrndm */
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndm_f32 (float32x2_t __a)
{
-- return __builtin_aarch64_uqshl_nv8qi_uus (__a, __b);
+- return __builtin_aarch64_uqshlv4si_uus ( __a, __b);
+ return __builtin_aarch64_floorv2sf (__a);
}
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vqshl_n_u16 (uint16x4_t __a, const int __b)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vqshlq_u64 (uint64x2_t __a, int64x2_t __b)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndm_f64 (float64x1_t __a)
{
-- return __builtin_aarch64_uqshl_nv4hi_uus (__a, __b);
+- return __builtin_aarch64_uqshlv2di_uus ( __a, __b);
+ return vset_lane_f64 (__builtin_floor (vget_lane_f64 (__a, 0)), __a, 0);
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vqshl_n_u32 (uint32x2_t __a, const int __b)
+-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
+-vqshlb_s8 (int8_t __a, int8_t __b)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndmq_f32 (float32x4_t __a)
{
-- return __builtin_aarch64_uqshl_nv2si_uus (__a, __b);
+- return __builtin_aarch64_sqshlqi (__a, __b);
+ return __builtin_aarch64_floorv4sf (__a);
}
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vqshl_n_u64 (uint64x1_t __a, const int __b)
+-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+-vqshlh_s16 (int16_t __a, int16_t __b)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndmq_f64 (float64x2_t __a)
{
-- return (uint64x1_t) {__builtin_aarch64_uqshl_ndi_uus (__a[0], __b)};
+- return __builtin_aarch64_sqshlhi (__a, __b);
+ return __builtin_aarch64_floorv2df (__a);
}
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vqshlq_n_s8 (int8x16_t __a, const int __b)
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vqshls_s32 (int32_t __a, int32_t __b)
+/* vrndn */
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndn_f32 (float32x2_t __a)
{
-- return (int8x16_t) __builtin_aarch64_sqshl_nv16qi (__a, __b);
+- return __builtin_aarch64_sqshlsi (__a, __b);
+ return __builtin_aarch64_frintnv2sf (__a);
}
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vqshlq_n_s16 (int16x8_t __a, const int __b)
+-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+-vqshld_s64 (int64_t __a, int64_t __b)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndn_f64 (float64x1_t __a)
{
-- return (int16x8_t) __builtin_aarch64_sqshl_nv8hi (__a, __b);
+- return __builtin_aarch64_sqshldi (__a, __b);
+ return (float64x1_t) {__builtin_aarch64_frintndf (__a[0])};
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqshlq_n_s32 (int32x4_t __a, const int __b)
+-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
+-vqshlb_u8 (uint8_t __a, uint8_t __b)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndnq_f32 (float32x4_t __a)
{
-- return (int32x4_t) __builtin_aarch64_sqshl_nv4si (__a, __b);
+- return __builtin_aarch64_uqshlqi_uus (__a, __b);
+ return __builtin_aarch64_frintnv4sf (__a);
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqshlq_n_s64 (int64x2_t __a, const int __b)
+-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
+-vqshlh_u16 (uint16_t __a, uint16_t __b)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndnq_f64 (float64x2_t __a)
{
-- return (int64x2_t) __builtin_aarch64_sqshl_nv2di (__a, __b);
+- return __builtin_aarch64_uqshlhi_uus (__a, __b);
+ return __builtin_aarch64_frintnv2df (__a);
}
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vqshlq_n_u8 (uint8x16_t __a, const int __b)
+-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+-vqshls_u32 (uint32_t __a, uint32_t __b)
+/* vrndp */
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndp_f32 (float32x2_t __a)
{
-- return __builtin_aarch64_uqshl_nv16qi_uus (__a, __b);
+- return __builtin_aarch64_uqshlsi_uus (__a, __b);
+ return __builtin_aarch64_ceilv2sf (__a);
}
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vqshlq_n_u16 (uint16x8_t __a, const int __b)
+-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+-vqshld_u64 (uint64_t __a, uint64_t __b)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndp_f64 (float64x1_t __a)
{
-- return __builtin_aarch64_uqshl_nv8hi_uus (__a, __b);
+- return __builtin_aarch64_uqshldi_uus (__a, __b);
+ return vset_lane_f64 (__builtin_ceil (vget_lane_f64 (__a, 0)), __a, 0);
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vqshlq_n_u32 (uint32x4_t __a, const int __b)
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vqshl_n_s8 (int8x8_t __a, const int __b)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndpq_f32 (float32x4_t __a)
{
-- return __builtin_aarch64_uqshl_nv4si_uus (__a, __b);
+- return (int8x8_t) __builtin_aarch64_sqshl_nv8qi (__a, __b);
+ return __builtin_aarch64_ceilv4sf (__a);
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vqshlq_n_u64 (uint64x2_t __a, const int __b)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vqshl_n_s16 (int16x4_t __a, const int __b)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndpq_f64 (float64x2_t __a)
{
-- return __builtin_aarch64_uqshl_nv2di_uus (__a, __b);
+- return (int16x4_t) __builtin_aarch64_sqshl_nv4hi (__a, __b);
+ return __builtin_aarch64_ceilv2df (__a);
}
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqshlb_n_s8 (int8_t __a, const int __b)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vqshl_n_s32 (int32x2_t __a, const int __b)
+/* vrndx */
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndx_f32 (float32x2_t __a)
{
-- return (int8_t) __builtin_aarch64_sqshl_nqi (__a, __b);
+- return (int32x2_t) __builtin_aarch64_sqshl_nv2si (__a, __b);
+ return __builtin_aarch64_rintv2sf (__a);
}
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqshlh_n_s16 (int16_t __a, const int __b)
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+-vqshl_n_s64 (int64x1_t __a, const int __b)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndx_f64 (float64x1_t __a)
{
-- return (int16_t) __builtin_aarch64_sqshl_nhi (__a, __b);
+- return (int64x1_t) {__builtin_aarch64_sqshl_ndi (__a[0], __b)};
+ return vset_lane_f64 (__builtin_rint (vget_lane_f64 (__a, 0)), __a, 0);
}
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqshls_n_s32 (int32_t __a, const int __b)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vqshl_n_u8 (uint8x8_t __a, const int __b)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndxq_f32 (float32x4_t __a)
{
-- return (int32_t) __builtin_aarch64_sqshl_nsi (__a, __b);
+- return __builtin_aarch64_uqshl_nv8qi_uus (__a, __b);
+ return __builtin_aarch64_rintv4sf (__a);
}
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqshld_n_s64 (int64_t __a, const int __b)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vqshl_n_u16 (uint16x4_t __a, const int __b)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndxq_f64 (float64x2_t __a)
{
-- return __builtin_aarch64_sqshl_ndi (__a, __b);
+- return __builtin_aarch64_uqshl_nv4hi_uus (__a, __b);
+ return __builtin_aarch64_rintv2df (__a);
}
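
As the builtin names above show, the vrnd* variants map onto the standard rounding modes: vrnd truncates toward zero, vrndm and vrndp are floor and ceiling, vrnda rounds ties away from zero, vrndn rounds ties to even, and vrndi/vrndx follow the current mode (without and with the inexact exception, respectively). A quick sketch:

#include <arm_neon.h>

void
rounding_modes (float32x2_t v)
{
  float32x2_t toward_zero = vrnd_f32 (v);    /* trunc                  */
  float32x2_t downward    = vrndm_f32 (v);   /* floor                  */
  float32x2_t upward      = vrndp_f32 (v);   /* ceil                   */
  float32x2_t ties_even   = vrndn_f32 (v);   /* nearest, ties to even  */
  (void) toward_zero; (void) downward; (void) upward; (void) ties_even;
}
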
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vqshlb_n_u8 (uint8_t __a, const int __b)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vqshl_n_u32 (uint32x2_t __a, const int __b)
+/* vrshl */
+
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshl_s8 (int8x8_t __a, int8x8_t __b)
{
-- return __builtin_aarch64_uqshl_nqi_uus (__a, __b);
+- return __builtin_aarch64_uqshl_nv2si_uus (__a, __b);
+ return (int8x8_t) __builtin_aarch64_srshlv8qi (__a, __b);
}
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vqshlh_n_u16 (uint16_t __a, const int __b)
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+-vqshl_n_u64 (uint64x1_t __a, const int __b)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshl_s16 (int16x4_t __a, int16x4_t __b)
{
-- return __builtin_aarch64_uqshl_nhi_uus (__a, __b);
+- return (uint64x1_t) {__builtin_aarch64_uqshl_ndi_uus (__a[0], __b)};
+ return (int16x4_t) __builtin_aarch64_srshlv4hi (__a, __b);
}
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vqshls_n_u32 (uint32_t __a, const int __b)
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+-vqshlq_n_s8 (int8x16_t __a, const int __b)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshl_s32 (int32x2_t __a, int32x2_t __b)
{
-- return __builtin_aarch64_uqshl_nsi_uus (__a, __b);
+- return (int8x16_t) __builtin_aarch64_sqshl_nv16qi (__a, __b);
+ return (int32x2_t) __builtin_aarch64_srshlv2si (__a, __b);
}
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vqshld_n_u64 (uint64_t __a, const int __b)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vqshlq_n_s16 (int16x8_t __a, const int __b)
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshl_s64 (int64x1_t __a, int64x1_t __b)
{
-- return __builtin_aarch64_uqshl_ndi_uus (__a, __b);
+- return (int16x8_t) __builtin_aarch64_sqshl_nv8hi (__a, __b);
+ return (int64x1_t) {__builtin_aarch64_srshldi (__a[0], __b[0])};
}
--/* vqshlu */
--
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqshlu_n_s8 (int8x8_t __a, const int __b)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vqshlq_n_s32 (int32x4_t __a, const int __b)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshl_u8 (uint8x8_t __a, int8x8_t __b)
{
-- return __builtin_aarch64_sqshlu_nv8qi_uss (__a, __b);
+- return (int32x4_t) __builtin_aarch64_sqshl_nv4si (__a, __b);
+ return __builtin_aarch64_urshlv8qi_uus (__a, __b);
}
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vqshlu_n_s16 (int16x4_t __a, const int __b)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vqshlq_n_s64 (int64x2_t __a, const int __b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshl_u16 (uint16x4_t __a, int16x4_t __b)
{
-- return __builtin_aarch64_sqshlu_nv4hi_uss (__a, __b);
+- return (int64x2_t) __builtin_aarch64_sqshl_nv2di (__a, __b);
+ return __builtin_aarch64_urshlv4hi_uus (__a, __b);
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vqshlu_n_s32 (int32x2_t __a, const int __b)
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vqshlq_n_u8 (uint8x16_t __a, const int __b)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshl_u32 (uint32x2_t __a, int32x2_t __b)
{
-- return __builtin_aarch64_sqshlu_nv2si_uss (__a, __b);
+- return __builtin_aarch64_uqshl_nv16qi_uus (__a, __b);
+ return __builtin_aarch64_urshlv2si_uus (__a, __b);
}
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vqshlu_n_s64 (int64x1_t __a, const int __b)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vqshlq_n_u16 (uint16x8_t __a, const int __b)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshl_u64 (uint64x1_t __a, int64x1_t __b)
{
-- return (uint64x1_t) {__builtin_aarch64_sqshlu_ndi_uss (__a[0], __b)};
+- return __builtin_aarch64_uqshl_nv8hi_uus (__a, __b);
+ return (uint64x1_t) {__builtin_aarch64_urshldi_uus (__a[0], __b[0])};
}
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vqshluq_n_s8 (int8x16_t __a, const int __b)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vqshlq_n_u32 (uint32x4_t __a, const int __b)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshlq_s8 (int8x16_t __a, int8x16_t __b)
{
-- return __builtin_aarch64_sqshlu_nv16qi_uss (__a, __b);
+- return __builtin_aarch64_uqshl_nv4si_uus (__a, __b);
+ return (int8x16_t) __builtin_aarch64_srshlv16qi (__a, __b);
}
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vqshluq_n_s16 (int16x8_t __a, const int __b)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vqshlq_n_u64 (uint64x2_t __a, const int __b)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshlq_s16 (int16x8_t __a, int16x8_t __b)
{
-- return __builtin_aarch64_sqshlu_nv8hi_uss (__a, __b);
+- return __builtin_aarch64_uqshl_nv2di_uus (__a, __b);
+ return (int16x8_t) __builtin_aarch64_srshlv8hi (__a, __b);
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vqshluq_n_s32 (int32x4_t __a, const int __b)
+-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
+-vqshlb_n_s8 (int8_t __a, const int __b)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshlq_s32 (int32x4_t __a, int32x4_t __b)
{
-- return __builtin_aarch64_sqshlu_nv4si_uss (__a, __b);
+- return (int8_t) __builtin_aarch64_sqshl_nqi (__a, __b);
+ return (int32x4_t) __builtin_aarch64_srshlv4si (__a, __b);
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vqshluq_n_s64 (int64x2_t __a, const int __b)
+-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+-vqshlh_n_s16 (int16_t __a, const int __b)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshlq_s64 (int64x2_t __a, int64x2_t __b)
{
-- return __builtin_aarch64_sqshlu_nv2di_uss (__a, __b);
+- return (int16_t) __builtin_aarch64_sqshl_nhi (__a, __b);
+ return (int64x2_t) __builtin_aarch64_srshlv2di (__a, __b);
}
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqshlub_n_s8 (int8_t __a, const int __b)
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vqshls_n_s32 (int32_t __a, const int __b)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshlq_u8 (uint8x16_t __a, int8x16_t __b)
{
-- return (int8_t) __builtin_aarch64_sqshlu_nqi_uss (__a, __b);
+- return (int32_t) __builtin_aarch64_sqshl_nsi (__a, __b);
+ return __builtin_aarch64_urshlv16qi_uus (__a, __b);
}
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqshluh_n_s16 (int16_t __a, const int __b)
+-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+-vqshld_n_s64 (int64_t __a, const int __b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshlq_u16 (uint16x8_t __a, int16x8_t __b)
{
-- return (int16_t) __builtin_aarch64_sqshlu_nhi_uss (__a, __b);
+- return __builtin_aarch64_sqshl_ndi (__a, __b);
+ return __builtin_aarch64_urshlv8hi_uus (__a, __b);
}
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqshlus_n_s32 (int32_t __a, const int __b)
+-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
+-vqshlb_n_u8 (uint8_t __a, const int __b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshlq_u32 (uint32x4_t __a, int32x4_t __b)
{
-- return (int32_t) __builtin_aarch64_sqshlu_nsi_uss (__a, __b);
+- return __builtin_aarch64_uqshl_nqi_uus (__a, __b);
+ return __builtin_aarch64_urshlv4si_uus (__a, __b);
}
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vqshlud_n_s64 (int64_t __a, const int __b)
+-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
+-vqshlh_n_u16 (uint16_t __a, const int __b)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshlq_u64 (uint64x2_t __a, int64x2_t __b)
{
-- return __builtin_aarch64_sqshlu_ndi_uss (__a, __b);
+- return __builtin_aarch64_uqshl_nhi_uus (__a, __b);
+ return __builtin_aarch64_urshlv2di_uus (__a, __b);
}
--/* vqshrn */
--
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqshrn_n_s16 (int16x8_t __a, const int __b)
+-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+-vqshls_n_u32 (uint32_t __a, const int __b)
+__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshld_s64 (int64_t __a, int64_t __b)
{
-- return (int8x8_t) __builtin_aarch64_sqshrn_nv8hi (__a, __b);
+- return __builtin_aarch64_uqshl_nsi_uus (__a, __b);
+ return __builtin_aarch64_srshldi (__a, __b);
}
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqshrn_n_s32 (int32x4_t __a, const int __b)
+-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+-vqshld_n_u64 (uint64_t __a, const int __b)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshld_u64 (uint64_t __a, int64_t __b)
{
-- return (int16x4_t) __builtin_aarch64_sqshrn_nv4si (__a, __b);
+- return __builtin_aarch64_uqshl_ndi_uus (__a, __b);
+ return __builtin_aarch64_urshldi_uus (__a, __b);
}
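
For context: the vrshl_*/vrshlq_* family converted above implements per-lane
*rounding* shifts, where a negative count in the second operand becomes a
rounding right shift. A minimal usage sketch (illustration only, not part of
the applied hunk; assumes GCC targeting -march=armv8-a):

  #include <arm_neon.h>

  /* Lanes 0-1 are doubled; lanes 2-3 are divided by 4 with
     round-to-nearest, i.e. (x + 2) >> 2.  */
  int16x4_t
  scale_lanes (int16x4_t v)
  {
    const int16x4_t counts = {1, 1, -2, -2};
    return vrshl_s16 (v, counts);
  }
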
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqshrn_n_s64 (int64x2_t __a, const int __b)
+-/* vqshlu */
+/* vrshr */
-+
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vqshlu_n_s8 (int8x8_t __a, const int __b)
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshr_n_s8 (int8x8_t __a, const int __b)
{
-- return (int32x2_t) __builtin_aarch64_sqshrn_nv2di (__a, __b);
+- return __builtin_aarch64_sqshlu_nv8qi_uss (__a, __b);
+ return (int8x8_t) __builtin_aarch64_srshr_nv8qi (__a, __b);
}
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqshrn_n_u16 (uint16x8_t __a, const int __b)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vqshlu_n_s16 (int16x4_t __a, const int __b)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshr_n_s16 (int16x4_t __a, const int __b)
{
-- return __builtin_aarch64_uqshrn_nv8hi_uus ( __a, __b);
+- return __builtin_aarch64_sqshlu_nv4hi_uss (__a, __b);
+ return (int16x4_t) __builtin_aarch64_srshr_nv4hi (__a, __b);
}
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vqshrn_n_u32 (uint32x4_t __a, const int __b)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vqshlu_n_s32 (int32x2_t __a, const int __b)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshr_n_s32 (int32x2_t __a, const int __b)
{
-- return __builtin_aarch64_uqshrn_nv4si_uus ( __a, __b);
+- return __builtin_aarch64_sqshlu_nv2si_uss (__a, __b);
+ return (int32x2_t) __builtin_aarch64_srshr_nv2si (__a, __b);
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vqshrn_n_u64 (uint64x2_t __a, const int __b)
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+-vqshlu_n_s64 (int64x1_t __a, const int __b)
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshr_n_s64 (int64x1_t __a, const int __b)
{
-- return __builtin_aarch64_uqshrn_nv2di_uus ( __a, __b);
+- return (uint64x1_t) {__builtin_aarch64_sqshlu_ndi_uss (__a[0], __b)};
+ return (int64x1_t) {__builtin_aarch64_srshr_ndi (__a[0], __b)};
}
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqshrnh_n_s16 (int16_t __a, const int __b)
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vqshluq_n_s8 (int8x16_t __a, const int __b)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshr_n_u8 (uint8x8_t __a, const int __b)
{
-- return (int8_t) __builtin_aarch64_sqshrn_nhi (__a, __b);
+- return __builtin_aarch64_sqshlu_nv16qi_uss (__a, __b);
+ return __builtin_aarch64_urshr_nv8qi_uus (__a, __b);
}
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqshrns_n_s32 (int32_t __a, const int __b)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vqshluq_n_s16 (int16x8_t __a, const int __b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshr_n_u16 (uint16x4_t __a, const int __b)
{
-- return (int16_t) __builtin_aarch64_sqshrn_nsi (__a, __b);
+- return __builtin_aarch64_sqshlu_nv8hi_uss (__a, __b);
+ return __builtin_aarch64_urshr_nv4hi_uus (__a, __b);
}
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqshrnd_n_s64 (int64_t __a, const int __b)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vqshluq_n_s32 (int32x4_t __a, const int __b)
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshr_n_u32 (uint32x2_t __a, const int __b)
{
-- return (int32_t) __builtin_aarch64_sqshrn_ndi (__a, __b);
+- return __builtin_aarch64_sqshlu_nv4si_uss (__a, __b);
+ return __builtin_aarch64_urshr_nv2si_uus (__a, __b);
}
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vqshrnh_n_u16 (uint16_t __a, const int __b)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vqshluq_n_s64 (int64x2_t __a, const int __b)
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshr_n_u64 (uint64x1_t __a, const int __b)
{
-- return __builtin_aarch64_uqshrn_nhi_uus (__a, __b);
+- return __builtin_aarch64_sqshlu_nv2di_uss (__a, __b);
+ return (uint64x1_t) {__builtin_aarch64_urshr_ndi_uus (__a[0], __b)};
}
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vqshrns_n_u32 (uint32_t __a, const int __b)
+-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
+-vqshlub_n_s8 (int8_t __a, const int __b)
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshrq_n_s8 (int8x16_t __a, const int __b)
{
-- return __builtin_aarch64_uqshrn_nsi_uus (__a, __b);
+- return (int8_t) __builtin_aarch64_sqshlu_nqi_uss (__a, __b);
+ return (int8x16_t) __builtin_aarch64_srshr_nv16qi (__a, __b);
}
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vqshrnd_n_u64 (uint64_t __a, const int __b)
+-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+-vqshluh_n_s16 (int16_t __a, const int __b)
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshrq_n_s16 (int16x8_t __a, const int __b)
{
-- return __builtin_aarch64_uqshrn_ndi_uus (__a, __b);
+- return (int16_t) __builtin_aarch64_sqshlu_nhi_uss (__a, __b);
+ return (int16x8_t) __builtin_aarch64_srshr_nv8hi (__a, __b);
}
--/* vqshrun */
--
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqshrun_n_s16 (int16x8_t __a, const int __b)
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vqshlus_n_s32 (int32_t __a, const int __b)
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshrq_n_s32 (int32x4_t __a, const int __b)
{
-- return (uint8x8_t) __builtin_aarch64_sqshrun_nv8hi (__a, __b);
+- return (int32_t) __builtin_aarch64_sqshlu_nsi_uss (__a, __b);
+ return (int32x4_t) __builtin_aarch64_srshr_nv4si (__a, __b);
}
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vqshrun_n_s32 (int32x4_t __a, const int __b)
+-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+-vqshlud_n_s64 (int64_t __a, const int __b)
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshrq_n_s64 (int64x2_t __a, const int __b)
{
-- return (uint16x4_t) __builtin_aarch64_sqshrun_nv4si (__a, __b);
+- return __builtin_aarch64_sqshlu_ndi_uss (__a, __b);
+ return (int64x2_t) __builtin_aarch64_srshr_nv2di (__a, __b);
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vqshrun_n_s64 (int64x2_t __a, const int __b)
+-/* vqshrn */
+-
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vqshrn_n_s16 (int16x8_t __a, const int __b)
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshrq_n_u8 (uint8x16_t __a, const int __b)
{
-- return (uint32x2_t) __builtin_aarch64_sqshrun_nv2di (__a, __b);
+- return (int8x8_t) __builtin_aarch64_sqshrn_nv8hi (__a, __b);
+ return __builtin_aarch64_urshr_nv16qi_uus (__a, __b);
}
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqshrunh_n_s16 (int16_t __a, const int __b)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vqshrn_n_s32 (int32x4_t __a, const int __b)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshrq_n_u16 (uint16x8_t __a, const int __b)
{
-- return (int8_t) __builtin_aarch64_sqshrun_nhi (__a, __b);
+- return (int16x4_t) __builtin_aarch64_sqshrn_nv4si (__a, __b);
+ return __builtin_aarch64_urshr_nv8hi_uus (__a, __b);
}
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqshruns_n_s32 (int32_t __a, const int __b)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vqshrn_n_s64 (int64x2_t __a, const int __b)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshrq_n_u32 (uint32x4_t __a, const int __b)
{
-- return (int16_t) __builtin_aarch64_sqshrun_nsi (__a, __b);
+- return (int32x2_t) __builtin_aarch64_sqshrn_nv2di (__a, __b);
+ return __builtin_aarch64_urshr_nv4si_uus (__a, __b);
}
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqshrund_n_s64 (int64_t __a, const int __b)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vqshrn_n_u16 (uint16x8_t __a, const int __b)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshrq_n_u64 (uint64x2_t __a, const int __b)
{
-- return (int32_t) __builtin_aarch64_sqshrun_ndi (__a, __b);
+- return __builtin_aarch64_uqshrn_nv8hi_uus ( __a, __b);
+ return __builtin_aarch64_urshr_nv2di_uus (__a, __b);
}
--/* vqsub */
--
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqsubb_s8 (int8_t __a, int8_t __b)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vqshrn_n_u32 (uint32x4_t __a, const int __b)
+__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshrd_n_s64 (int64_t __a, const int __b)
{
-- return (int8_t) __builtin_aarch64_sqsubqi (__a, __b);
+- return __builtin_aarch64_uqshrn_nv4si_uus ( __a, __b);
+ return __builtin_aarch64_srshr_ndi (__a, __b);
}
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqsubh_s16 (int16_t __a, int16_t __b)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vqshrn_n_u64 (uint64x2_t __a, const int __b)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrshrd_n_u64 (uint64_t __a, const int __b)
{
-- return (int16_t) __builtin_aarch64_sqsubhi (__a, __b);
+- return __builtin_aarch64_uqshrn_nv2di_uus ( __a, __b);
+ return __builtin_aarch64_urshr_ndi_uus (__a, __b);
}
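
The vrshr_n_* intrinsics introduced here are the immediate-count counterpart:
each lane is shifted right by a constant n with a rounding bias of
1 << (n - 1) added first. A small sketch (illustration only; the helper name
is hypothetical):

  #include <arm_neon.h>

  /* Rounding divide of each lane by 16: (x + 8) >> 4.  */
  uint16x4_t
  round_div16 (uint16x4_t sums)
  {
    return vrshr_n_u16 (sums, 4);
  }
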
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqsubs_s32 (int32_t __a, int32_t __b)
+-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
+-vqshrnh_n_s16 (int16_t __a, const int __b)
+/* vrsqrte. */
+
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrtes_f32 (float32_t __a)
{
-- return (int32_t) __builtin_aarch64_sqsubsi (__a, __b);
+- return (int8_t) __builtin_aarch64_sqshrn_nhi (__a, __b);
+ return __builtin_aarch64_rsqrtesf (__a);
}
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqsubd_s64 (int64_t __a, int64_t __b)
+-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+-vqshrns_n_s32 (int32_t __a, const int __b)
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrted_f64 (float64_t __a)
{
-- return __builtin_aarch64_sqsubdi (__a, __b);
+- return (int16_t) __builtin_aarch64_sqshrn_nsi (__a, __b);
+ return __builtin_aarch64_rsqrtedf (__a);
}
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vqsubb_u8 (uint8_t __a, uint8_t __b)
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vqshrnd_n_s64 (int64_t __a, const int __b)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrte_f32 (float32x2_t __a)
{
-- return (uint8_t) __builtin_aarch64_uqsubqi_uuu (__a, __b);
+- return (int32_t) __builtin_aarch64_sqshrn_ndi (__a, __b);
+ return __builtin_aarch64_rsqrtev2sf (__a);
}
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vqsubh_u16 (uint16_t __a, uint16_t __b)
+-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
+-vqshrnh_n_u16 (uint16_t __a, const int __b)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrte_f64 (float64x1_t __a)
{
-- return (uint16_t) __builtin_aarch64_uqsubhi_uuu (__a, __b);
+- return __builtin_aarch64_uqshrn_nhi_uus (__a, __b);
+ return (float64x1_t) {vrsqrted_f64 (vget_lane_f64 (__a, 0))};
}
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vqsubs_u32 (uint32_t __a, uint32_t __b)
+-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
+-vqshrns_n_u32 (uint32_t __a, const int __b)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrteq_f32 (float32x4_t __a)
{
-- return (uint32_t) __builtin_aarch64_uqsubsi_uuu (__a, __b);
+- return __builtin_aarch64_uqshrn_nsi_uus (__a, __b);
+ return __builtin_aarch64_rsqrtev4sf (__a);
}
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vqsubd_u64 (uint64_t __a, uint64_t __b)
+-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+-vqshrnd_n_u64 (uint64_t __a, const int __b)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrteq_f64 (float64x2_t __a)
{
-- return __builtin_aarch64_uqsubdi_uuu (__a, __b);
+- return __builtin_aarch64_uqshrn_ndi_uus (__a, __b);
+ return __builtin_aarch64_rsqrtev2df (__a);
}
--/* vqtbl2 */
+-/* vqshrun */
+/* vrsqrts. */
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqtbl2_s8 (int8x16x2_t tab, uint8x8_t idx)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vqshrun_n_s16 (int16x8_t __a, const int __b)
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrtss_f32 (float32_t __a, float32_t __b)
{
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1);
-- return __builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
+- return (uint8x8_t) __builtin_aarch64_sqshrun_nv8hi (__a, __b);
+ return __builtin_aarch64_rsqrtssf (__a, __b);
}
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqtbl2_u8 (uint8x16x2_t tab, uint8x8_t idx)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vqshrun_n_s32 (int32x4_t __a, const int __b)
+__extension__ extern __inline float64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrtsd_f64 (float64_t __a, float64_t __b)
{
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- return (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
+- return (uint16x4_t) __builtin_aarch64_sqshrun_nv4si (__a, __b);
+ return __builtin_aarch64_rsqrtsdf (__a, __b);
}
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vqtbl2_p8 (poly8x16x2_t tab, uint8x8_t idx)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vqshrun_n_s64 (int64x2_t __a, const int __b)
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrts_f32 (float32x2_t __a, float32x2_t __b)
{
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- return (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
+- return (uint32x2_t) __builtin_aarch64_sqshrun_nv2di (__a, __b);
+ return __builtin_aarch64_rsqrtsv2sf (__a, __b);
}
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vqtbl2q_s8 (int8x16x2_t tab, uint8x16_t idx)
+-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
+-vqshrunh_n_s16 (int16_t __a, const int __b)
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrts_f64 (float64x1_t __a, float64x1_t __b)
{
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- return __builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx);
+- return (int8_t) __builtin_aarch64_sqshrun_nhi (__a, __b);
+ return (float64x1_t) {vrsqrtsd_f64 (vget_lane_f64 (__a, 0),
+ vget_lane_f64 (__b, 0))};
}
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vqtbl2q_u8 (uint8x16x2_t tab, uint8x16_t idx)
+-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+-vqshruns_n_s32 (int32_t __a, const int __b)
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrtsq_f32 (float32x4_t __a, float32x4_t __b)
{
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- return (uint8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx);
+- return (int16_t) __builtin_aarch64_sqshrun_nsi (__a, __b);
+ return __builtin_aarch64_rsqrtsv4sf (__a, __b);
}
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vqtbl2q_p8 (poly8x16x2_t tab, uint8x16_t idx)
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vqshrund_n_s64 (int64_t __a, const int __b)
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrtsq_f64 (float64x2_t __a, float64x2_t __b)
{
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- return (poly8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx);
+- return (int32_t) __builtin_aarch64_sqshrun_ndi (__a, __b);
+ return __builtin_aarch64_rsqrtsv2df (__a, __b);
}
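
vrsqrte* gives only a low-precision reciprocal-square-root estimate;
vrsqrts* exists to refine it, computing (3 - a*b) / 2 so that a
Newton-Raphson step costs a single extra multiply. A sketch of the usual
two-step refinement (illustration only; roughly single-precision accuracy,
not a drop-in libm replacement):

  #include <arm_neon.h>

  float32x4_t
  rsqrt_refined (float32x4_t x)
  {
    float32x4_t e = vrsqrteq_f32 (x);                       /* ~8-bit estimate */
    e = vmulq_f32 (e, vrsqrtsq_f32 (vmulq_f32 (x, e), e));  /* NR step 1 */
    e = vmulq_f32 (e, vrsqrtsq_f32 (vmulq_f32 (x, e), e));  /* NR step 2 */
    return e;
  }
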
--/* vqtbl3 */
+-/* vqsub */
+/* vrsra */
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqtbl3_s8 (int8x16x3_t tab, uint8x8_t idx)
+-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
+-vqsubb_s8 (int8_t __a, int8_t __b)
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
{
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-- return __builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx);
+- return (int8_t) __builtin_aarch64_sqsubqi (__a, __b);
+ return (int8x8_t) __builtin_aarch64_srsra_nv8qi (__a, __b, __c);
}
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqtbl3_u8 (uint8x16x3_t tab, uint8x8_t idx)
+-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+-vqsubh_s16 (int16_t __a, int16_t __b)
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
{
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-- return (uint8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx);
+- return (int16_t) __builtin_aarch64_sqsubhi (__a, __b);
+ return (int16x4_t) __builtin_aarch64_srsra_nv4hi (__a, __b, __c);
}
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vqtbl3_p8 (poly8x16x3_t tab, uint8x8_t idx)
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+-vqsubs_s32 (int32_t __a, int32_t __b)
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
{
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-- return (poly8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx);
+- return (int32_t) __builtin_aarch64_sqsubsi (__a, __b);
+ return (int32x2_t) __builtin_aarch64_srsra_nv2si (__a, __b, __c);
}
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vqtbl3q_s8 (int8x16x3_t tab, uint8x16_t idx)
+-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+-vqsubd_s64 (int64_t __a, int64_t __b)
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
{
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-- return __builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx);
+- return __builtin_aarch64_sqsubdi (__a, __b);
+ return (int64x1_t) {__builtin_aarch64_srsra_ndi (__a[0], __b[0], __c)};
}
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vqtbl3q_u8 (uint8x16x3_t tab, uint8x16_t idx)
+-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
+-vqsubb_u8 (uint8_t __a, uint8_t __b)
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
{
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-- return (uint8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx);
+- return (uint8_t) __builtin_aarch64_uqsubqi_uuu (__a, __b);
+ return __builtin_aarch64_ursra_nv8qi_uuus (__a, __b, __c);
}
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vqtbl3q_p8 (poly8x16x3_t tab, uint8x16_t idx)
+-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
+-vqsubh_u16 (uint16_t __a, uint16_t __b)
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
{
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-- return (poly8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx);
+- return (uint16_t) __builtin_aarch64_uqsubhi_uuu (__a, __b);
+ return __builtin_aarch64_ursra_nv4hi_uuus (__a, __b, __c);
}
+-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+-vqsubs_u32 (uint32_t __a, uint32_t __b)
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vrsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+ {
+- return (uint32_t) __builtin_aarch64_uqsubsi_uuu (__a, __b);
++ return __builtin_aarch64_ursra_nv2si_uuus (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+-vqsubd_u64 (uint64_t __a, uint64_t __b)
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vrsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+ {
+- return __builtin_aarch64_uqsubdi_uuu (__a, __b);
++ return (uint64x1_t) {__builtin_aarch64_ursra_ndi_uuus (__a[0], __b[0], __c)};
+ }
+
+-/* vqtbl2 */
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vrsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
++{
++ return (int8x16_t) __builtin_aarch64_srsra_nv16qi (__a, __b, __c);
++}
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vqtbl2_s8 (int8x16x2_t tab, uint8x8_t idx)
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vrsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+ {
+- __builtin_aarch64_simd_oi __o;
+- __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0);
+- __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1);
+- return __builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
++ return (int16x8_t) __builtin_aarch64_srsra_nv8hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vqtbl2_u8 (uint8x16x2_t tab, uint8x8_t idx)
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vrsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+ {
+- __builtin_aarch64_simd_oi __o;
+- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
+- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
+- return (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
++ return (int32x4_t) __builtin_aarch64_srsra_nv4si (__a, __b, __c);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+-vqtbl2_p8 (poly8x16x2_t tab, uint8x8_t idx)
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vrsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+ {
+- __builtin_aarch64_simd_oi __o;
+- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
+- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
+- return (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
++ return (int64x2_t) __builtin_aarch64_srsra_nv2di (__a, __b, __c);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+-vqtbl2q_s8 (int8x16x2_t tab, uint8x16_t idx)
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vrsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+ {
+- __builtin_aarch64_simd_oi __o;
+- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
+- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
+- return __builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx);
++ return __builtin_aarch64_ursra_nv16qi_uuus (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vqtbl2q_u8 (uint8x16x2_t tab, uint8x16_t idx)
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vrsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+ {
+- __builtin_aarch64_simd_oi __o;
+- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
+- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
+- return (uint8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx);
++ return __builtin_aarch64_ursra_nv8hi_uuus (__a, __b, __c);
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+-vqtbl2q_p8 (poly8x16x2_t tab, uint8x16_t idx)
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vrsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+ {
+- __builtin_aarch64_simd_oi __o;
+- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
+- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
+- return (poly8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx);
++ return __builtin_aarch64_ursra_nv4si_uuus (__a, __b, __c);
+ }
+
+-/* vqtbl3 */
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vrsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
++{
++ return __builtin_aarch64_ursra_nv2di_uuus (__a, __b, __c);
++}
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vqtbl3_s8 (int8x16x3_t tab, uint8x8_t idx)
++__extension__ extern __inline int64_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vrsrad_n_s64 (int64_t __a, int64_t __b, const int __c)
+ {
+- __builtin_aarch64_simd_ci __o;
+- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
+- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
+- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
+- return __builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx);
++ return __builtin_aarch64_srsra_ndi (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vqtbl3_u8 (uint8x16x3_t tab, uint8x8_t idx)
++__extension__ extern __inline uint64_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vrsrad_n_u64 (uint64_t __a, uint64_t __b, const int __c)
+ {
+- __builtin_aarch64_simd_ci __o;
+- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
+- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
+- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
+- return (uint8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx);
++ return __builtin_aarch64_ursra_ndi_uuus (__a, __b, __c);
+ }
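
The vrsra_n_*/vrsraq_n_* forms rewritten above fuse the rounding right shift
with an accumulate: dst = a + ((b + (1 << (n - 1))) >> n), lane-wise. Sketch
(illustration only; the helper name is hypothetical):

  #include <arm_neon.h>

  /* acc += round (delta / 16), lane-wise.  */
  int32x2_t
  accumulate_scaled (int32x2_t acc, int32x2_t delta)
  {
    return vrsra_n_s32 (acc, delta, 4);
  }
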
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+-vqtbl3_p8 (poly8x16x3_t tab, uint8x8_t idx)
++#pragma GCC push_options
++#pragma GCC target ("+nothing+crypto")
++
++/* vsha1 */
++
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vsha1cq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk)
+ {
+- __builtin_aarch64_simd_ci __o;
+- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
+- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
+- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
+- return (poly8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx);
++ return __builtin_aarch64_crypto_sha1cv4si_uuuu (hash_abcd, hash_e, wk);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+-vqtbl3q_s8 (int8x16x3_t tab, uint8x16_t idx)
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vsha1mq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk)
+ {
+- __builtin_aarch64_simd_ci __o;
+- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
+- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
+- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
+- return __builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx);
++ return __builtin_aarch64_crypto_sha1mv4si_uuuu (hash_abcd, hash_e, wk);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vqtbl3q_u8 (uint8x16x3_t tab, uint8x16_t idx)
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vsha1pq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk)
+ {
+- __builtin_aarch64_simd_ci __o;
+- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
+- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
+- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
+- return (uint8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx);
++ return __builtin_aarch64_crypto_sha1pv4si_uuuu (hash_abcd, hash_e, wk);
++}
++
++__extension__ extern __inline uint32_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vsha1h_u32 (uint32_t hash_e)
++{
++ return __builtin_aarch64_crypto_sha1hsi_uu (hash_e);
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+-vqtbl3q_p8 (poly8x16x3_t tab, uint8x16_t idx)
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vsha1su0q_u32 (uint32x4_t w0_3, uint32x4_t w4_7, uint32x4_t w8_11)
+ {
+- __builtin_aarch64_simd_ci __o;
+- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
+- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
+- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
+- return (poly8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx);
++ return __builtin_aarch64_crypto_sha1su0v4si_uuuu (w0_3, w4_7, w8_11);
+ }
+
-/* vqtbl4 */
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqtbl4_s8 (int8x16x4_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
++vsha1su1q_u32 (uint32x4_t tw0_3, uint32x4_t w12_15)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
@@ -39655,14 +41562,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
- return __builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx);
-+ return __builtin_aarch64_ursra_nv2si_uuus (__a, __b, __c);
++ return __builtin_aarch64_crypto_sha1su1v4si_uuu (tw0_3, w12_15);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqtbl4_u8 (uint8x16x4_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
++vsha256hq_u32 (uint32x4_t hash_abcd, uint32x4_t hash_efgh, uint32x4_t wk)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
@@ -39670,14 +41577,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
- return (uint8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx);
-+ return (uint64x1_t) {__builtin_aarch64_ursra_ndi_uuus (__a[0], __b[0], __c)};
++ return __builtin_aarch64_crypto_sha256hv4si_uuuu (hash_abcd, hash_efgh, wk);
}
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vqtbl4_p8 (poly8x16x4_t tab, uint8x8_t idx)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
++vsha256h2q_u32 (uint32x4_t hash_efgh, uint32x4_t hash_abcd, uint32x4_t wk)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
@@ -39685,14 +41592,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
- return (poly8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx);
-+ return (int8x16_t) __builtin_aarch64_srsra_nv16qi (__a, __b, __c);
++ return __builtin_aarch64_crypto_sha256h2v4si_uuuu (hash_efgh, hash_abcd, wk);
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqtbl4q_s8 (int8x16x4_t tab, uint8x16_t idx)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
++vsha256su0q_u32 (uint32x4_t w0_3, uint32x4_t w4_7)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
@@ -39700,14 +41607,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
- return __builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx);
-+ return (int16x8_t) __builtin_aarch64_srsra_nv8hi (__a, __b, __c);
++ return __builtin_aarch64_crypto_sha256su0v4si_uuu (w0_3, w4_7);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqtbl4q_u8 (uint8x16x4_t tab, uint8x16_t idx)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
++vsha256su1q_u32 (uint32x4_t tw0_3, uint32x4_t w8_11, uint32x4_t w12_15)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
@@ -39715,14 +41622,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
- return (uint8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx);
-+ return (int32x4_t) __builtin_aarch64_srsra_nv4si (__a, __b, __c);
++ return __builtin_aarch64_crypto_sha256su1v4si_uuuu (tw0_3, w8_11, w12_15);
}
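
The SHA intrinsics above are guarded by the +crypto pragma added in this
hunk; a sketch of one four-round SHA-256 state update using them
(illustration only; message-schedule handling via vsha256su0q/vsha256su1q is
the caller's job):

  #include <arm_neon.h>

  void
  sha256_quad (uint32x4_t *abcd, uint32x4_t *efgh, uint32x4_t wk)
  {
    uint32x4_t abcd_prev = *abcd;
    *abcd = vsha256hq_u32 (abcd_prev, *efgh, wk);
    *efgh = vsha256h2q_u32 (*efgh, abcd_prev, wk);
  }
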
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vqtbl4q_p8 (poly8x16x4_t tab, uint8x16_t idx)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline poly128_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
++vmull_p64 (poly64_t a, poly64_t b)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
@@ -39730,118 +41637,118 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
- return (poly8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx);
-+ return (int64x2_t) __builtin_aarch64_srsra_nv2di (__a, __b, __c);
++ return
++ __builtin_aarch64_crypto_pmulldi_ppp (a, b);
}
-
-/* vqtbx2 */
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqtbx2_s8 (int8x8_t r, int8x16x2_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline poly128_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
++vmull_high_p64 (poly64x2_t a, poly64x2_t b)
{
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1);
- return __builtin_aarch64_tbx4v8qi (r, __o, (int8x8_t)idx);
-+ return __builtin_aarch64_ursra_nv16qi_uuus (__a, __b, __c);
++ return __builtin_aarch64_crypto_pmullv2di_ppp (a, b);
}
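
vmull_p64/vmull_high_p64 are the carry-less (polynomial) 64x64 -> 128-bit
multiplies used by GHASH in AES-GCM and by CRC folding. Sketch (illustration
only; requires the same +crypto target as above):

  #include <arm_neon.h>

  poly128_t
  clmul_lo (poly64x2_t a, poly64x2_t b)
  {
    return vmull_p64 (vgetq_lane_p64 (a, 0), vgetq_lane_p64 (b, 0));
  }

  poly128_t
  clmul_hi (poly64x2_t a, poly64x2_t b)
  {
    return vmull_high_p64 (a, b);   /* upper lanes */
  }
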
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqtbx2_u8 (uint8x8_t r, uint8x16x2_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint16x8_t
++#pragma GCC pop_options
++
++/* vshl */
++
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
++vshl_n_s8 (int8x8_t __a, const int __b)
{
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
- return (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)r, __o,
- (int8x8_t)idx);
-+ return __builtin_aarch64_ursra_nv8hi_uuus (__a, __b, __c);
++ return (int8x8_t) __builtin_aarch64_ashlv8qi (__a, __b);
}
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vqtbx2_p8 (poly8x8_t r, poly8x16x2_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
++vshl_n_s16 (int16x4_t __a, const int __b)
{
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
- return (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)r, __o,
- (int8x8_t)idx);
-+ return __builtin_aarch64_ursra_nv4si_uuus (__a, __b, __c);
++ return (int16x4_t) __builtin_aarch64_ashlv4hi (__a, __b);
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqtbx2q_s8 (int8x16_t r, int8x16x2_t tab, uint8x16_t idx)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
++vshl_n_s32 (int32x2_t __a, const int __b)
{
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1);
- return __builtin_aarch64_tbx4v16qi (r, __o, (int8x16_t)idx);
-+ return __builtin_aarch64_ursra_nv2di_uuus (__a, __b, __c);
++ return (int32x2_t) __builtin_aarch64_ashlv2si (__a, __b);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqtbx2q_u8 (uint8x16_t r, uint8x16x2_t tab, uint8x16_t idx)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsrad_n_s64 (int64_t __a, int64_t __b, const int __c)
++vshl_n_s64 (int64x1_t __a, const int __b)
{
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
- return (uint8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)r, __o,
- (int8x16_t)idx);
-+ return __builtin_aarch64_srsra_ndi (__a, __b, __c);
++ return (int64x1_t) {__builtin_aarch64_ashldi (__a[0], __b)};
}
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vqtbx2q_p8 (poly8x16_t r, poly8x16x2_t tab, uint8x16_t idx)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsrad_n_u64 (uint64_t __a, uint64_t __b, const int __c)
++vshl_n_u8 (uint8x8_t __a, const int __b)
{
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
- return (poly8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)r, __o,
- (int8x16_t)idx);
-+ return __builtin_aarch64_ursra_ndi_uuus (__a, __b, __c);
++ return (uint8x8_t) __builtin_aarch64_ashlv8qi ((int8x8_t) __a, __b);
}
-/* vqtbx3 */
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqtbx3_s8 (int8x8_t r, int8x16x3_t tab, uint8x8_t idx)
-+#pragma GCC push_options
-+#pragma GCC target ("+nothing+crypto")
-+
-+/* vsha1 */
-+
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsha1cq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk)
++vshl_n_u16 (uint16x4_t __a, const int __b)
{
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[0], 0);
- __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[1], 1);
- __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[2], 2);
- return __builtin_aarch64_qtbx3v8qi (r, __o, (int8x8_t)idx);
-+ return __builtin_aarch64_crypto_sha1cv4si_uuuu (hash_abcd, hash_e, wk);
++ return (uint16x4_t) __builtin_aarch64_ashlv4hi ((int16x4_t) __a, __b);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqtbx3_u8 (uint8x8_t r, uint8x16x3_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsha1mq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk)
++vshl_n_u32 (uint32x2_t __a, const int __b)
{
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
@@ -39849,14 +41756,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
- return (uint8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)r, __o,
- (int8x8_t)idx);
-+ return __builtin_aarch64_crypto_sha1mv4si_uuuu (hash_abcd, hash_e, wk);
++ return (uint32x2_t) __builtin_aarch64_ashlv2si ((int32x2_t) __a, __b);
}
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vqtbx3_p8 (poly8x8_t r, poly8x16x3_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsha1pq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk)
++vshl_n_u64 (uint64x1_t __a, const int __b)
{
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
@@ -39864,28 +41771,28 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
- return (poly8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)r, __o,
- (int8x8_t)idx);
-+ return __builtin_aarch64_crypto_sha1pv4si_uuuu (hash_abcd, hash_e, wk);
++ return (uint64x1_t) {__builtin_aarch64_ashldi ((int64_t) __a[0], __b)};
}
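
Unlike the rounding vrshl/vrshr family earlier in this hunk, vshl_n_* is a
plain lane-wise "<< n" with an immediate count and no rounding. Sketch
(illustration only):

  #include <arm_neon.h>

  uint32x2_t
  times8 (uint32x2_t v)
  {
    return vshl_n_u32 (v, 3);
  }
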
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqtbx3q_s8 (int8x16_t r, int8x16x3_t tab, uint8x16_t idx)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsha1h_u32 (uint32_t hash_e)
++vshlq_n_s8 (int8x16_t __a, const int __b)
{
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[0], 0);
- __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[1], 1);
- __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[2], 2);
- return __builtin_aarch64_qtbx3v16qi (r, __o, (int8x16_t)idx);
-+ return __builtin_aarch64_crypto_sha1hsi_uu (hash_e);
++ return (int8x16_t) __builtin_aarch64_ashlv16qi (__a, __b);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqtbx3q_u8 (uint8x16_t r, uint8x16x3_t tab, uint8x16_t idx)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsha1su0q_u32 (uint32x4_t w0_3, uint32x4_t w4_7, uint32x4_t w8_11)
++vshlq_n_s16 (int16x8_t __a, const int __b)
{
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
@@ -39893,14 +41800,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
- return (uint8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)r, __o,
- (int8x16_t)idx);
-+ return __builtin_aarch64_crypto_sha1su0v4si_uuuu (w0_3, w4_7, w8_11);
++ return (int16x8_t) __builtin_aarch64_ashlv8hi (__a, __b);
}
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vqtbx3q_p8 (poly8x16_t r, poly8x16x3_t tab, uint8x16_t idx)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsha1su1q_u32 (uint32x4_t tw0_3, uint32x4_t w12_15)
++vshlq_n_s32 (int32x4_t __a, const int __b)
{
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
@@ -39908,22 +41815,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
- return (poly8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)r, __o,
- (int8x16_t)idx);
-+ return __builtin_aarch64_crypto_sha1su1v4si_uuu (tw0_3, w12_15);
++ return (int32x4_t) __builtin_aarch64_ashlv4si (__a, __b);
}
-/* vqtbx4 */
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsha256hq_u32 (uint32x4_t hash_abcd, uint32x4_t hash_efgh, uint32x4_t wk)
-+{
-+ return __builtin_aarch64_crypto_sha256hv4si_uuuu (hash_abcd, hash_efgh, wk);
-+}
-
+-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqtbx4_s8 (int8x8_t r, int8x16x4_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsha256h2q_u32 (uint32x4_t hash_efgh, uint32x4_t hash_abcd, uint32x4_t wk)
++vshlq_n_s64 (int64x2_t __a, const int __b)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[0], 0);
@@ -39931,14 +41832,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[3], 3);
- return __builtin_aarch64_qtbx4v8qi (r, __o, (int8x8_t)idx);
-+ return __builtin_aarch64_crypto_sha256h2v4si_uuuu (hash_efgh, hash_abcd, wk);
++ return (int64x2_t) __builtin_aarch64_ashlv2di (__a, __b);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqtbx4_u8 (uint8x8_t r, uint8x16x4_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsha256su0q_u32 (uint32x4_t w0_3, uint32x4_t w4_7)
++vshlq_n_u8 (uint8x16_t __a, const int __b)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
@@ -39947,14 +41848,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
- return (uint8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)r, __o,
- (int8x8_t)idx);
-+ return __builtin_aarch64_crypto_sha256su0v4si_uuu (w0_3, w4_7);
++ return (uint8x16_t) __builtin_aarch64_ashlv16qi ((int8x16_t) __a, __b);
}
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vqtbx4_p8 (poly8x8_t r, poly8x16x4_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsha256su1q_u32 (uint32x4_t tw0_3, uint32x4_t w8_11, uint32x4_t w12_15)
++vshlq_n_u16 (uint16x8_t __a, const int __b)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
@@ -39963,14 +41864,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
- return (poly8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)r, __o,
- (int8x8_t)idx);
-+ return __builtin_aarch64_crypto_sha256su1v4si_uuuu (tw0_3, w8_11, w12_15);
++ return (uint16x8_t) __builtin_aarch64_ashlv8hi ((int16x8_t) __a, __b);
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqtbx4q_s8 (int8x16_t r, int8x16x4_t tab, uint8x16_t idx)
-+__extension__ extern __inline poly128_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_p64 (poly64_t a, poly64_t b)
++vshlq_n_u32 (uint32x4_t __a, const int __b)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[0], 0);
@@ -39978,15 +41879,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[3], 3);
- return __builtin_aarch64_qtbx4v16qi (r, __o, (int8x16_t)idx);
-+ return
-+ __builtin_aarch64_crypto_pmulldi_ppp (a, b);
++ return (uint32x4_t) __builtin_aarch64_ashlv4si ((int32x4_t) __a, __b);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqtbx4q_u8 (uint8x16_t r, uint8x16x4_t tab, uint8x16_t idx)
-+__extension__ extern __inline poly128_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_high_p64 (poly64x2_t a, poly64x2_t b)
++vshlq_n_u64 (uint64x2_t __a, const int __b)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
@@ -39995,18 +41895,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
- return (uint8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)r, __o,
- (int8x16_t)idx);
-+ return __builtin_aarch64_crypto_pmullv2di_ppp (a, b);
++ return (uint64x2_t) __builtin_aarch64_ashlv2di ((int64x2_t) __a, __b);
}
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vqtbx4q_p8 (poly8x16_t r, poly8x16x4_t tab, uint8x16_t idx)
-+#pragma GCC pop_options
-+
-+/* vshl */
-+
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_n_s8 (int8x8_t __a, const int __b)
++vshld_n_s64 (int64_t __a, const int __b)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
@@ -40015,228 +41911,216 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
- return (poly8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)r, __o,
- (int8x16_t)idx);
-+ return (int8x8_t) __builtin_aarch64_ashlv8qi (__a, __b);
++ return __builtin_aarch64_ashldi (__a, __b);
}
-/* vrbit */
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_n_s16 (int16x4_t __a, const int __b)
-+{
-+ return (int16x4_t) __builtin_aarch64_ashlv4hi (__a, __b);
-+}
-
+-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vrbit_p8 (poly8x8_t __a)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_n_s32 (int32x2_t __a, const int __b)
++vshld_n_u64 (uint64_t __a, const int __b)
{
- return (poly8x8_t) __builtin_aarch64_rbitv8qi ((int8x8_t) __a);
-+ return (int32x2_t) __builtin_aarch64_ashlv2si (__a, __b);
++ return (uint64_t) __builtin_aarch64_ashldi (__a, __b);
}
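(For reference, not part of the patch: the vshl_n/vshlq_n/vshld_n definitions above shift every lane left by a compile-time immediate. A minimal sketch, assuming an AArch64 compiler with <arm_neon.h>; the helper name is illustrative.)

#include <arm_neon.h>

/* Shift every 32-bit lane left by 3: {1,2,3,4} -> {8,16,24,32}.
   vshlq_n_s32 expands to __builtin_aarch64_ashlv4si as in the hunk above.  */
void
shl_example (const int32_t *in, int32_t *out)
{
  int32x4_t v = vld1q_s32 (in);
  vst1q_s32 (out, vshlq_n_s32 (v, 3));
}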
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrbit_s8 (int8x8_t __a)
-+__extension__ extern __inline int64x1_t
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_n_s64 (int64x1_t __a, const int __b)
++vshl_s8 (int8x8_t __a, int8x8_t __b)
{
- return __builtin_aarch64_rbitv8qi (__a);
-+ return (int64x1_t) {__builtin_aarch64_ashldi (__a[0], __b)};
++ return __builtin_aarch64_sshlv8qi (__a, __b);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrbit_u8 (uint8x8_t __a)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_n_u8 (uint8x8_t __a, const int __b)
++vshl_s16 (int16x4_t __a, int16x4_t __b)
{
- return (uint8x8_t) __builtin_aarch64_rbitv8qi ((int8x8_t) __a);
-+ return (uint8x8_t) __builtin_aarch64_ashlv8qi ((int8x8_t) __a, __b);
++ return __builtin_aarch64_sshlv4hi (__a, __b);
}
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vrbitq_p8 (poly8x16_t __a)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_n_u16 (uint16x4_t __a, const int __b)
++vshl_s32 (int32x2_t __a, int32x2_t __b)
{
- return (poly8x16_t) __builtin_aarch64_rbitv16qi ((int8x16_t)__a);
-+ return (uint16x4_t) __builtin_aarch64_ashlv4hi ((int16x4_t) __a, __b);
++ return __builtin_aarch64_sshlv2si (__a, __b);
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vrbitq_s8 (int8x16_t __a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_n_u32 (uint32x2_t __a, const int __b)
++vshl_s64 (int64x1_t __a, int64x1_t __b)
{
- return __builtin_aarch64_rbitv16qi (__a);
-+ return (uint32x2_t) __builtin_aarch64_ashlv2si ((int32x2_t) __a, __b);
++ return (int64x1_t) {__builtin_aarch64_sshldi (__a[0], __b[0])};
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vrbitq_u8 (uint8x16_t __a)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_n_u64 (uint64x1_t __a, const int __b)
++vshl_u8 (uint8x8_t __a, int8x8_t __b)
{
- return (uint8x16_t) __builtin_aarch64_rbitv16qi ((int8x16_t) __a);
-+ return (uint64x1_t) {__builtin_aarch64_ashldi ((int64_t) __a[0], __b)};
++ return __builtin_aarch64_ushlv8qi_uus (__a, __b);
}
-/* vrecpe */
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_n_s8 (int8x16_t __a, const int __b)
-+{
-+ return (int8x16_t) __builtin_aarch64_ashlv16qi (__a, __b);
-+}
-
+-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrecpe_u32 (uint32x2_t __a)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_n_s16 (int16x8_t __a, const int __b)
++vshl_u16 (uint16x4_t __a, int16x4_t __b)
{
- return (uint32x2_t) __builtin_aarch64_urecpev2si ((int32x2_t) __a);
-+ return (int16x8_t) __builtin_aarch64_ashlv8hi (__a, __b);
++ return __builtin_aarch64_ushlv4hi_uus (__a, __b);
}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vrecpeq_u32 (uint32x4_t __a)
+
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_n_s32 (int32x4_t __a, const int __b)
++vshl_u32 (uint32x2_t __a, int32x2_t __b)
{
- return (uint32x4_t) __builtin_aarch64_urecpev4si ((int32x4_t) __a);
-+ return (int32x4_t) __builtin_aarch64_ashlv4si (__a, __b);
++ return __builtin_aarch64_ushlv2si_uus (__a, __b);
}
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vrecpes_f32 (float32_t __a)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_n_s64 (int64x2_t __a, const int __b)
++vshl_u64 (uint64x1_t __a, int64x1_t __b)
{
- return __builtin_aarch64_frecpesf (__a);
-+ return (int64x2_t) __builtin_aarch64_ashlv2di (__a, __b);
++ return (uint64x1_t) {__builtin_aarch64_ushldi_uus (__a[0], __b[0])};
}
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vrecped_f64 (float64_t __a)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_n_u8 (uint8x16_t __a, const int __b)
++vshlq_s8 (int8x16_t __a, int8x16_t __b)
{
- return __builtin_aarch64_frecpedf (__a);
-+ return (uint8x16_t) __builtin_aarch64_ashlv16qi ((int8x16_t) __a, __b);
++ return __builtin_aarch64_sshlv16qi (__a, __b);
}
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrecpe_f32 (float32x2_t __a)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_n_u16 (uint16x8_t __a, const int __b)
++vshlq_s16 (int16x8_t __a, int16x8_t __b)
{
- return __builtin_aarch64_frecpev2sf (__a);
-+ return (uint16x8_t) __builtin_aarch64_ashlv8hi ((int16x8_t) __a, __b);
++ return __builtin_aarch64_sshlv8hi (__a, __b);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrecpeq_f32 (float32x4_t __a)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_n_u32 (uint32x4_t __a, const int __b)
++vshlq_s32 (int32x4_t __a, int32x4_t __b)
{
- return __builtin_aarch64_frecpev4sf (__a);
-+ return (uint32x4_t) __builtin_aarch64_ashlv4si ((int32x4_t) __a, __b);
++ return __builtin_aarch64_sshlv4si (__a, __b);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vrecpeq_f64 (float64x2_t __a)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_n_u64 (uint64x2_t __a, const int __b)
++vshlq_s64 (int64x2_t __a, int64x2_t __b)
{
- return __builtin_aarch64_frecpev2df (__a);
-+ return (uint64x2_t) __builtin_aarch64_ashlv2di ((int64x2_t) __a, __b);
++ return __builtin_aarch64_sshlv2di (__a, __b);
}
-/* vrecps */
-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vrecpss_f32 (float32_t __a, float32_t __b)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshld_n_s64 (int64_t __a, const int __b)
++vshlq_u8 (uint8x16_t __a, int8x16_t __b)
{
- return __builtin_aarch64_frecpssf (__a, __b);
-+ return __builtin_aarch64_ashldi (__a, __b);
++ return __builtin_aarch64_ushlv16qi_uus (__a, __b);
}
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vrecpsd_f64 (float64_t __a, float64_t __b)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshld_n_u64 (uint64_t __a, const int __b)
++vshlq_u16 (uint16x8_t __a, int16x8_t __b)
{
- return __builtin_aarch64_frecpsdf (__a, __b);
-+ return (uint64_t) __builtin_aarch64_ashldi (__a, __b);
++ return __builtin_aarch64_ushlv8hi_uus (__a, __b);
}
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrecps_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_s8 (int8x8_t __a, int8x8_t __b)
++vshlq_u32 (uint32x4_t __a, int32x4_t __b)
{
- return __builtin_aarch64_frecpsv2sf (__a, __b);
-+ return __builtin_aarch64_sshlv8qi (__a, __b);
++ return __builtin_aarch64_ushlv4si_uus (__a, __b);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrecpsq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_s16 (int16x4_t __a, int16x4_t __b)
++vshlq_u64 (uint64x2_t __a, int64x2_t __b)
{
- return __builtin_aarch64_frecpsv4sf (__a, __b);
-+ return __builtin_aarch64_sshlv4hi (__a, __b);
++ return __builtin_aarch64_ushlv2di_uus (__a, __b);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vrecpsq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_s32 (int32x2_t __a, int32x2_t __b)
++vshld_s64 (int64_t __a, int64_t __b)
{
- return __builtin_aarch64_frecpsv2df (__a, __b);
-+ return __builtin_aarch64_sshlv2si (__a, __b);
++ return __builtin_aarch64_sshldi (__a, __b);
}
-/* vrecpx */
-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vrecpxs_f32 (float32_t __a)
-+__extension__ extern __inline int64x1_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_s64 (int64x1_t __a, int64x1_t __b)
++vshld_u64 (uint64_t __a, uint64_t __b)
{
- return __builtin_aarch64_frecpxsf (__a);
-+ return (int64x1_t) {__builtin_aarch64_sshldi (__a[0], __b[0])};
++ return __builtin_aarch64_ushldi_uus (__a, __b);
}
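(The non-immediate vshl_* forms above take a per-lane signed shift count, where a negative count shifts right, following SSHL/USHL semantics. A small standalone sketch, again outside the patch:)

#include <arm_neon.h>

/* Per-lane variable shift: positive counts shift left, negative right.
   Lanes {8,8,8,8} with counts {1,0,-1,-2} give {16,8,4,2}.  */
void
shl_var_example (const int32_t *in, const int32_t *counts, int32_t *out)
{
  vst1q_s32 (out, vshlq_s32 (vld1q_s32 (in), vld1q_s32 (counts)));
}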
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vrecpxd_f64 (float64_t __a)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_u8 (uint8x8_t __a, int8x8_t __b)
++vshll_high_n_s8 (int8x16_t __a, const int __b)
{
- return __builtin_aarch64_frecpxdf (__a);
-+ return __builtin_aarch64_ushlv8qi_uus (__a, __b);
++ return __builtin_aarch64_sshll2_nv16qi (__a, __b);
}
-
@@ -40244,1201 +42128,1200 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vrev16_p8 (poly8x8_t a)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_u16 (uint16x4_t __a, int16x4_t __b)
++vshll_high_n_s16 (int16x8_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
-+ return __builtin_aarch64_ushlv4hi_uus (__a, __b);
++ return __builtin_aarch64_sshll2_nv8hi (__a, __b);
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrev16_s8 (int8x8_t a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_u32 (uint32x2_t __a, int32x2_t __b)
++vshll_high_n_s32 (int32x4_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
-+ return __builtin_aarch64_ushlv2si_uus (__a, __b);
++ return __builtin_aarch64_sshll2_nv4si (__a, __b);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrev16_u8 (uint8x8_t a)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_u64 (uint64x1_t __a, int64x1_t __b)
++vshll_high_n_u8 (uint8x16_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
-+ return (uint64x1_t) {__builtin_aarch64_ushldi_uus (__a[0], __b[0])};
++ return (uint16x8_t) __builtin_aarch64_ushll2_nv16qi ((int8x16_t) __a, __b);
}
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vrev16q_p8 (poly8x16_t a)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_s8 (int8x16_t __a, int8x16_t __b)
++vshll_high_n_u16 (uint16x8_t __a, const int __b)
{
- return __builtin_shuffle (a,
- (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
-+ return __builtin_aarch64_sshlv16qi (__a, __b);
++ return (uint32x4_t) __builtin_aarch64_ushll2_nv8hi ((int16x8_t) __a, __b);
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vrev16q_s8 (int8x16_t a)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_s16 (int16x8_t __a, int16x8_t __b)
++vshll_high_n_u32 (uint32x4_t __a, const int __b)
{
- return __builtin_shuffle (a,
- (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
-+ return __builtin_aarch64_sshlv8hi (__a, __b);
++ return (uint64x2_t) __builtin_aarch64_ushll2_nv4si ((int32x4_t) __a, __b);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vrev16q_u8 (uint8x16_t a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_s32 (int32x4_t __a, int32x4_t __b)
++vshll_n_s8 (int8x8_t __a, const int __b)
{
- return __builtin_shuffle (a,
- (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
-+ return __builtin_aarch64_sshlv4si (__a, __b);
++ return __builtin_aarch64_sshll_nv8qi (__a, __b);
}
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vrev32_p8 (poly8x8_t a)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_s64 (int64x2_t __a, int64x2_t __b)
++vshll_n_s16 (int16x4_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
-+ return __builtin_aarch64_sshlv2di (__a, __b);
++ return __builtin_aarch64_sshll_nv4hi (__a, __b);
}
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vrev32_p16 (poly16x4_t a)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_u8 (uint8x16_t __a, int8x16_t __b)
++vshll_n_s32 (int32x2_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 });
-+ return __builtin_aarch64_ushlv16qi_uus (__a, __b);
++ return __builtin_aarch64_sshll_nv2si (__a, __b);
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrev32_s8 (int8x8_t a)
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_u16 (uint16x8_t __a, int16x8_t __b)
++vshll_n_u8 (uint8x8_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
-+ return __builtin_aarch64_ushlv8hi_uus (__a, __b);
++ return __builtin_aarch64_ushll_nv8qi_uus (__a, __b);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vrev32_s16 (int16x4_t a)
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_u32 (uint32x4_t __a, int32x4_t __b)
++vshll_n_u16 (uint16x4_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 });
-+ return __builtin_aarch64_ushlv4si_uus (__a, __b);
++ return __builtin_aarch64_ushll_nv4hi_uus (__a, __b);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrev32_u8 (uint8x8_t a)
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_u64 (uint64x2_t __a, int64x2_t __b)
++vshll_n_u32 (uint32x2_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
-+ return __builtin_aarch64_ushlv2di_uus (__a, __b);
++ return __builtin_aarch64_ushll_nv2si_uus (__a, __b);
}
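(vshll_n_* widens each lane to double width while shifting, useful for turning 8-bit data into a scaled 16-bit form. An illustrative snippet, not from the patch:)

#include <arm_neon.h>

/* Widen eight u8 lanes to u16 and multiply by 16 (one USHLL).  */
void
widen_example (const uint8_t *in, uint16_t *out)
{
  vst1q_u16 (out, vshll_n_u8 (vld1_u8 (in), 4));
}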
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vrev32_u16 (uint16x4_t a)
-+__extension__ extern __inline int64_t
++/* vshr */
++
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshld_s64 (int64_t __a, int64_t __b)
++vshr_n_s8 (int8x8_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 });
-+ return __builtin_aarch64_sshldi (__a, __b);
++ return (int8x8_t) __builtin_aarch64_ashrv8qi (__a, __b);
}
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vrev32q_p8 (poly8x16_t a)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshld_u64 (uint64_t __a, uint64_t __b)
++vshr_n_s16 (int16x4_t __a, const int __b)
{
- return __builtin_shuffle (a,
- (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
-+ return __builtin_aarch64_ushldi_uus (__a, __b);
++ return (int16x4_t) __builtin_aarch64_ashrv4hi (__a, __b);
}
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vrev32q_p16 (poly16x8_t a)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_high_n_s8 (int8x16_t __a, const int __b)
++vshr_n_s32 (int32x2_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
-+ return __builtin_aarch64_sshll2_nv16qi (__a, __b);
++ return (int32x2_t) __builtin_aarch64_ashrv2si (__a, __b);
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vrev32q_s8 (int8x16_t a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_high_n_s16 (int16x8_t __a, const int __b)
++vshr_n_s64 (int64x1_t __a, const int __b)
{
- return __builtin_shuffle (a,
- (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
-+ return __builtin_aarch64_sshll2_nv8hi (__a, __b);
++ return (int64x1_t) {__builtin_aarch64_ashr_simddi (__a[0], __b)};
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vrev32q_s16 (int16x8_t a)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_high_n_s32 (int32x4_t __a, const int __b)
++vshr_n_u8 (uint8x8_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
-+ return __builtin_aarch64_sshll2_nv4si (__a, __b);
++ return (uint8x8_t) __builtin_aarch64_lshrv8qi ((int8x8_t) __a, __b);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vrev32q_u8 (uint8x16_t a)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_high_n_u8 (uint8x16_t __a, const int __b)
++vshr_n_u16 (uint16x4_t __a, const int __b)
{
- return __builtin_shuffle (a,
- (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
-+ return (uint16x8_t) __builtin_aarch64_ushll2_nv16qi ((int8x16_t) __a, __b);
++ return (uint16x4_t) __builtin_aarch64_lshrv4hi ((int16x4_t) __a, __b);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vrev32q_u16 (uint16x8_t a)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_high_n_u16 (uint16x8_t __a, const int __b)
++vshr_n_u32 (uint32x2_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
-+ return (uint32x4_t) __builtin_aarch64_ushll2_nv8hi ((int16x8_t) __a, __b);
++ return (uint32x2_t) __builtin_aarch64_lshrv2si ((int32x2_t) __a, __b);
}
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrev64_f32 (float32x2_t a)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_high_n_u32 (uint32x4_t __a, const int __b)
++vshr_n_u64 (uint64x1_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint32x2_t) { 1, 0 });
-+ return (uint64x2_t) __builtin_aarch64_ushll2_nv4si ((int32x4_t) __a, __b);
++ return (uint64x1_t) {__builtin_aarch64_lshr_simddi_uus ( __a[0], __b)};
}
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vrev64_p8 (poly8x8_t a)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_n_s8 (int8x8_t __a, const int __b)
++vshrq_n_s8 (int8x16_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
-+ return __builtin_aarch64_sshll_nv8qi (__a, __b);
++ return (int8x16_t) __builtin_aarch64_ashrv16qi (__a, __b);
}
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vrev64_p16 (poly16x4_t a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_n_s16 (int16x4_t __a, const int __b)
++vshrq_n_s16 (int16x8_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 });
-+ return __builtin_aarch64_sshll_nv4hi (__a, __b);
++ return (int16x8_t) __builtin_aarch64_ashrv8hi (__a, __b);
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrev64_s8 (int8x8_t a)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_n_s32 (int32x2_t __a, const int __b)
++vshrq_n_s32 (int32x4_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
-+ return __builtin_aarch64_sshll_nv2si (__a, __b);
++ return (int32x4_t) __builtin_aarch64_ashrv4si (__a, __b);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vrev64_s16 (int16x4_t a)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_n_u8 (uint8x8_t __a, const int __b)
++vshrq_n_s64 (int64x2_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 });
-+ return __builtin_aarch64_ushll_nv8qi_uus (__a, __b);
++ return (int64x2_t) __builtin_aarch64_ashrv2di (__a, __b);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vrev64_s32 (int32x2_t a)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_n_u16 (uint16x4_t __a, const int __b)
++vshrq_n_u8 (uint8x16_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint32x2_t) { 1, 0 });
-+ return __builtin_aarch64_ushll_nv4hi_uus (__a, __b);
++ return (uint8x16_t) __builtin_aarch64_lshrv16qi ((int8x16_t) __a, __b);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrev64_u8 (uint8x8_t a)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_n_u32 (uint32x2_t __a, const int __b)
++vshrq_n_u16 (uint16x8_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
-+ return __builtin_aarch64_ushll_nv2si_uus (__a, __b);
++ return (uint16x8_t) __builtin_aarch64_lshrv8hi ((int16x8_t) __a, __b);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vrev64_u16 (uint16x4_t a)
-+/* vshr */
-+
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshr_n_s8 (int8x8_t __a, const int __b)
++vshrq_n_u32 (uint32x4_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 });
-+ return (int8x8_t) __builtin_aarch64_ashrv8qi (__a, __b);
++ return (uint32x4_t) __builtin_aarch64_lshrv4si ((int32x4_t) __a, __b);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrev64_u32 (uint32x2_t a)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshr_n_s16 (int16x4_t __a, const int __b)
++vshrq_n_u64 (uint64x2_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint32x2_t) { 1, 0 });
-+ return (int16x4_t) __builtin_aarch64_ashrv4hi (__a, __b);
++ return (uint64x2_t) __builtin_aarch64_lshrv2di ((int64x2_t) __a, __b);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrev64q_f32 (float32x4_t a)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshr_n_s32 (int32x2_t __a, const int __b)
++vshrd_n_s64 (int64_t __a, const int __b)
{
- return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 });
-+ return (int32x2_t) __builtin_aarch64_ashrv2si (__a, __b);
++ return __builtin_aarch64_ashr_simddi (__a, __b);
}
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vrev64q_p8 (poly8x16_t a)
-+__extension__ extern __inline int64x1_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshr_n_s64 (int64x1_t __a, const int __b)
++vshrd_n_u64 (uint64_t __a, const int __b)
{
- return __builtin_shuffle (a,
- (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
-+ return (int64x1_t) {__builtin_aarch64_ashr_simddi (__a[0], __b)};
++ return __builtin_aarch64_lshr_simddi_uus (__a, __b);
}
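(vshr_n_* is the matching right shift by an immediate: arithmetic for signed types, logical for unsigned. For instance, an unsigned divide-by-four, shown here purely as illustration:)

#include <arm_neon.h>

/* Logical right shift: each u8 lane divided by 4.  */
void
shr_example (const uint8_t *in, uint8_t *out)
{
  vst1_u8 (out, vshr_n_u8 (vld1_u8 (in), 2));
}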
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vrev64q_p16 (poly16x8_t a)
-+__extension__ extern __inline uint8x8_t
++/* vsli */
++
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshr_n_u8 (uint8x8_t __a, const int __b)
++vsli_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
{
- return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
-+ return (uint8x8_t) __builtin_aarch64_lshrv8qi ((int8x8_t) __a, __b);
++ return (int8x8_t) __builtin_aarch64_ssli_nv8qi (__a, __b, __c);
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vrev64q_s8 (int8x16_t a)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshr_n_u16 (uint16x4_t __a, const int __b)
++vsli_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
{
- return __builtin_shuffle (a,
- (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
-+ return (uint16x4_t) __builtin_aarch64_lshrv4hi ((int16x4_t) __a, __b);
++ return (int16x4_t) __builtin_aarch64_ssli_nv4hi (__a, __b, __c);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vrev64q_s16 (int16x8_t a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshr_n_u32 (uint32x2_t __a, const int __b)
++vsli_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
{
- return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
-+ return (uint32x2_t) __builtin_aarch64_lshrv2si ((int32x2_t) __a, __b);
++ return (int32x2_t) __builtin_aarch64_ssli_nv2si (__a, __b, __c);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vrev64q_s32 (int32x4_t a)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshr_n_u64 (uint64x1_t __a, const int __b)
++vsli_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
{
- return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 });
-+ return (uint64x1_t) {__builtin_aarch64_lshr_simddi_uus ( __a[0], __b)};
++ return (int64x1_t) {__builtin_aarch64_ssli_ndi (__a[0], __b[0], __c)};
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vrev64q_u8 (uint8x16_t a)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshrq_n_s8 (int8x16_t __a, const int __b)
++vsli_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
{
- return __builtin_shuffle (a,
- (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
-+ return (int8x16_t) __builtin_aarch64_ashrv16qi (__a, __b);
++ return __builtin_aarch64_usli_nv8qi_uuus (__a, __b, __c);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vrev64q_u16 (uint16x8_t a)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshrq_n_s16 (int16x8_t __a, const int __b)
++vsli_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
{
- return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
-+ return (int16x8_t) __builtin_aarch64_ashrv8hi (__a, __b);
++ return __builtin_aarch64_usli_nv4hi_uuus (__a, __b, __c);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vrev64q_u32 (uint32x4_t a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshrq_n_s32 (int32x4_t __a, const int __b)
++vsli_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
{
- return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 });
-+ return (int32x4_t) __builtin_aarch64_ashrv4si (__a, __b);
++ return __builtin_aarch64_usli_nv2si_uuus (__a, __b, __c);
}
-/* vrnd */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrnd_f32 (float32x2_t __a)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshrq_n_s64 (int64x2_t __a, const int __b)
++vsli_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
{
- return __builtin_aarch64_btruncv2sf (__a);
-+ return (int64x2_t) __builtin_aarch64_ashrv2di (__a, __b);
++ return (uint64x1_t) {__builtin_aarch64_usli_ndi_uuus (__a[0], __b[0], __c)};
}
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vrnd_f64 (float64x1_t __a)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshrq_n_u8 (uint8x16_t __a, const int __b)
++vsli_n_p64 (poly64x1_t __a, poly64x1_t __b, const int __c)
{
- return vset_lane_f64 (__builtin_trunc (vget_lane_f64 (__a, 0)), __a, 0);
-+ return (uint8x16_t) __builtin_aarch64_lshrv16qi ((int8x16_t) __a, __b);
++ return (poly64x1_t) {__builtin_aarch64_ssli_ndi_ppps (__a[0], __b[0], __c)};
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrndq_f32 (float32x4_t __a)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshrq_n_u16 (uint16x8_t __a, const int __b)
++vsliq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
{
- return __builtin_aarch64_btruncv4sf (__a);
-+ return (uint16x8_t) __builtin_aarch64_lshrv8hi ((int16x8_t) __a, __b);
++ return (int8x16_t) __builtin_aarch64_ssli_nv16qi (__a, __b, __c);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vrndq_f64 (float64x2_t __a)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshrq_n_u32 (uint32x4_t __a, const int __b)
++vsliq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
{
- return __builtin_aarch64_btruncv2df (__a);
-+ return (uint32x4_t) __builtin_aarch64_lshrv4si ((int32x4_t) __a, __b);
++ return (int16x8_t) __builtin_aarch64_ssli_nv8hi (__a, __b, __c);
}
-/* vrnda */
--
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vsliq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
++{
++ return (int32x4_t) __builtin_aarch64_ssli_nv4si (__a, __b, __c);
++}
+
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrnda_f32 (float32x2_t __a)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshrq_n_u64 (uint64x2_t __a, const int __b)
++vsliq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
{
- return __builtin_aarch64_roundv2sf (__a);
-+ return (uint64x2_t) __builtin_aarch64_lshrv2di ((int64x2_t) __a, __b);
++ return (int64x2_t) __builtin_aarch64_ssli_nv2di (__a, __b, __c);
}
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vrnda_f64 (float64x1_t __a)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshrd_n_s64 (int64_t __a, const int __b)
++vsliq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
{
- return vset_lane_f64 (__builtin_round (vget_lane_f64 (__a, 0)), __a, 0);
-+ return __builtin_aarch64_ashr_simddi (__a, __b);
++ return __builtin_aarch64_usli_nv16qi_uuus (__a, __b, __c);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrndaq_f32 (float32x4_t __a)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshrd_n_u64 (uint64_t __a, const int __b)
++vsliq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
{
- return __builtin_aarch64_roundv4sf (__a);
-+ return __builtin_aarch64_lshr_simddi_uus (__a, __b);
++ return __builtin_aarch64_usli_nv8hi_uuus (__a, __b, __c);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vrndaq_f64 (float64x2_t __a)
-+/* vsli */
-+
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsli_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
++vsliq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
{
- return __builtin_aarch64_roundv2df (__a);
-+ return (int8x8_t) __builtin_aarch64_ssli_nv8qi (__a, __b, __c);
++ return __builtin_aarch64_usli_nv4si_uuus (__a, __b, __c);
}
-/* vrndi */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrndi_f32 (float32x2_t __a)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsli_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
++vsliq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
{
- return __builtin_aarch64_nearbyintv2sf (__a);
-+ return (int16x4_t) __builtin_aarch64_ssli_nv4hi (__a, __b, __c);
++ return __builtin_aarch64_usli_nv2di_uuus (__a, __b, __c);
}
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vrndi_f64 (float64x1_t __a)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsli_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
++vsliq_n_p64 (poly64x2_t __a, poly64x2_t __b, const int __c)
{
- return vset_lane_f64 (__builtin_nearbyint (vget_lane_f64 (__a, 0)), __a, 0);
-+ return (int32x2_t) __builtin_aarch64_ssli_nv2si (__a, __b, __c);
++ return __builtin_aarch64_ssli_nv2di_ppps (__a, __b, __c);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrndiq_f32 (float32x4_t __a)
-+__extension__ extern __inline int64x1_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsli_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
++vslid_n_s64 (int64_t __a, int64_t __b, const int __c)
{
- return __builtin_aarch64_nearbyintv4sf (__a);
-+ return (int64x1_t) {__builtin_aarch64_ssli_ndi (__a[0], __b[0], __c)};
++ return __builtin_aarch64_ssli_ndi (__a, __b, __c);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vrndiq_f64 (float64x2_t __a)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsli_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
++vslid_n_u64 (uint64_t __a, uint64_t __b, const int __c)
{
- return __builtin_aarch64_nearbyintv2df (__a);
-+ return __builtin_aarch64_usli_nv8qi_uuus (__a, __b, __c);
++ return __builtin_aarch64_usli_ndi_uuus (__a, __b, __c);
}
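(vsli_n_* shifts the second operand left by the immediate and inserts it into the first, preserving the first operand's low bits, so bitfields can be packed without a separate mask-and-or. A sketch assuming two 4-bit values per lane:)

#include <arm_neon.h>

/* Pack nibbles: each result lane = (hi << 4) | (lo & 0x0f).  */
uint8x8_t
pack_nibbles (uint8x8_t lo, uint8x8_t hi)
{
  return vsli_n_u8 (lo, hi, 4);
}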
-/* vrndm */
--
++/* vsqadd */
+
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrndm_f32 (float32x2_t __a)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsli_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
++vsqadd_u8 (uint8x8_t __a, int8x8_t __b)
{
- return __builtin_aarch64_floorv2sf (__a);
-+ return __builtin_aarch64_usli_nv4hi_uuus (__a, __b, __c);
++ return __builtin_aarch64_usqaddv8qi_uus (__a, __b);
}
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vrndm_f64 (float64x1_t __a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsli_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
++vsqadd_u16 (uint16x4_t __a, int16x4_t __b)
{
- return vset_lane_f64 (__builtin_floor (vget_lane_f64 (__a, 0)), __a, 0);
-+ return __builtin_aarch64_usli_nv2si_uuus (__a, __b, __c);
++ return __builtin_aarch64_usqaddv4hi_uus (__a, __b);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrndmq_f32 (float32x4_t __a)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsli_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
++vsqadd_u32 (uint32x2_t __a, int32x2_t __b)
{
- return __builtin_aarch64_floorv4sf (__a);
-+ return (uint64x1_t) {__builtin_aarch64_usli_ndi_uuus (__a[0], __b[0], __c)};
++ return __builtin_aarch64_usqaddv2si_uus (__a, __b);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vrndmq_f64 (float64x2_t __a)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsliq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
++vsqadd_u64 (uint64x1_t __a, int64x1_t __b)
{
- return __builtin_aarch64_floorv2df (__a);
-+ return (int8x16_t) __builtin_aarch64_ssli_nv16qi (__a, __b, __c);
++ return (uint64x1_t) {__builtin_aarch64_usqadddi_uus (__a[0], __b[0])};
}
-/* vrndn */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrndn_f32 (float32x2_t __a)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsliq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
++vsqaddq_u8 (uint8x16_t __a, int8x16_t __b)
{
- return __builtin_aarch64_frintnv2sf (__a);
-+ return (int16x8_t) __builtin_aarch64_ssli_nv8hi (__a, __b, __c);
++ return __builtin_aarch64_usqaddv16qi_uus (__a, __b);
}
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vrndn_f64 (float64x1_t __a)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsliq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
++vsqaddq_u16 (uint16x8_t __a, int16x8_t __b)
{
- return (float64x1_t) {__builtin_aarch64_frintndf (__a[0])};
-+ return (int32x4_t) __builtin_aarch64_ssli_nv4si (__a, __b, __c);
++ return __builtin_aarch64_usqaddv8hi_uus (__a, __b);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrndnq_f32 (float32x4_t __a)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsliq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
++vsqaddq_u32 (uint32x4_t __a, int32x4_t __b)
{
- return __builtin_aarch64_frintnv4sf (__a);
-+ return (int64x2_t) __builtin_aarch64_ssli_nv2di (__a, __b, __c);
++ return __builtin_aarch64_usqaddv4si_uus (__a, __b);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vrndnq_f64 (float64x2_t __a)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsliq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
++vsqaddq_u64 (uint64x2_t __a, int64x2_t __b)
{
- return __builtin_aarch64_frintnv2df (__a);
-+ return __builtin_aarch64_usli_nv16qi_uuus (__a, __b, __c);
++ return __builtin_aarch64_usqaddv2di_uus (__a, __b);
}
-/* vrndp */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrndp_f32 (float32x2_t __a)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsliq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
++vsqaddb_u8 (uint8_t __a, int8_t __b)
{
- return __builtin_aarch64_ceilv2sf (__a);
-+ return __builtin_aarch64_usli_nv8hi_uuus (__a, __b, __c);
++ return __builtin_aarch64_usqaddqi_uus (__a, __b);
}
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vrndp_f64 (float64x1_t __a)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline uint16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsliq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
++vsqaddh_u16 (uint16_t __a, int16_t __b)
{
- return vset_lane_f64 (__builtin_ceil (vget_lane_f64 (__a, 0)), __a, 0);
-+ return __builtin_aarch64_usli_nv4si_uuus (__a, __b, __c);
++ return __builtin_aarch64_usqaddhi_uus (__a, __b);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrndpq_f32 (float32x4_t __a)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsliq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
++vsqadds_u32 (uint32_t __a, int32_t __b)
{
- return __builtin_aarch64_ceilv4sf (__a);
-+ return __builtin_aarch64_usli_nv2di_uuus (__a, __b, __c);
++ return __builtin_aarch64_usqaddsi_uus (__a, __b);
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vrndpq_f64 (float64x2_t __a)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vslid_n_s64 (int64_t __a, int64_t __b, const int __c)
++vsqaddd_u64 (uint64_t __a, int64_t __b)
{
- return __builtin_aarch64_ceilv2df (__a);
-+ return __builtin_aarch64_ssli_ndi (__a, __b, __c);
++ return __builtin_aarch64_usqadddi_uus (__a, __b);
}
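(vsqadd_* maps to USQADD: it adds a signed adjustment to unsigned data, saturating at both ends of the unsigned range. A hypothetical use, e.g. brightening or darkening 8-bit pixels without wrap-around:)

#include <arm_neon.h>

/* Saturating pixel adjust: clamps to [0, 255] instead of wrapping.  */
void
adjust_pixels (const uint8_t *pix, const int8_t *delta, uint8_t *out)
{
  vst1_u8 (out, vsqadd_u8 (vld1_u8 (pix), vld1_s8 (delta)));
}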
-/* vrndx */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrndx_f32 (float32x2_t __a)
-+__extension__ extern __inline uint64_t
++/* vsqrt */
++__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vslid_n_u64 (uint64_t __a, uint64_t __b, const int __c)
++vsqrt_f32 (float32x2_t a)
{
- return __builtin_aarch64_rintv2sf (__a);
-+ return __builtin_aarch64_usli_ndi_uuus (__a, __b, __c);
++ return __builtin_aarch64_sqrtv2sf (a);
}
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vrndx_f64 (float64x1_t __a)
-+/* vsqadd */
-+
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqadd_u8 (uint8x8_t __a, int8x8_t __b)
++vsqrtq_f32 (float32x4_t a)
{
- return vset_lane_f64 (__builtin_rint (vget_lane_f64 (__a, 0)), __a, 0);
-+ return __builtin_aarch64_usqaddv8qi_uus (__a, __b);
++ return __builtin_aarch64_sqrtv4sf (a);
}
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrndxq_f32 (float32x4_t __a)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqadd_u16 (uint16x4_t __a, int16x4_t __b)
++vsqrt_f64 (float64x1_t a)
{
- return __builtin_aarch64_rintv4sf (__a);
-+ return __builtin_aarch64_usqaddv4hi_uus (__a, __b);
++ return (float64x1_t) { __builtin_aarch64_sqrtdf (a[0]) };
}
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vrndxq_f64 (float64x2_t __a)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqadd_u32 (uint32x2_t __a, int32x2_t __b)
++vsqrtq_f64 (float64x2_t a)
{
- return __builtin_aarch64_rintv2df (__a);
-+ return __builtin_aarch64_usqaddv2si_uus (__a, __b);
++ return __builtin_aarch64_sqrtv2df (a);
}
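(The vsqrt_* intrinsics above emit FSQRT directly, in contrast to the vrsqrte/vrsqrts estimate-and-refine pair. A minimal example, not part of the patch:)

#include <arm_neon.h>

/* Exact per-lane square root, four floats at a time.  */
void
sqrt4 (const float *in, float *out)
{
  vst1q_f32 (out, vsqrtq_f32 (vld1q_f32 (in)));
}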
-/* vrshl */
--
++/* vsra */
+
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrshl_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqadd_u64 (uint64x1_t __a, int64x1_t __b)
++vsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
{
- return (int8x8_t) __builtin_aarch64_srshlv8qi (__a, __b);
-+ return (uint64x1_t) {__builtin_aarch64_usqadddi_uus (__a[0], __b[0])};
++ return (int8x8_t) __builtin_aarch64_ssra_nv8qi (__a, __b, __c);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vrshl_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqaddq_u8 (uint8x16_t __a, int8x16_t __b)
++vsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
{
- return (int16x4_t) __builtin_aarch64_srshlv4hi (__a, __b);
-+ return __builtin_aarch64_usqaddv16qi_uus (__a, __b);
++ return (int16x4_t) __builtin_aarch64_ssra_nv4hi (__a, __b, __c);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vrshl_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqaddq_u16 (uint16x8_t __a, int16x8_t __b)
++vsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
{
- return (int32x2_t) __builtin_aarch64_srshlv2si (__a, __b);
-+ return __builtin_aarch64_usqaddv8hi_uus (__a, __b);
++ return (int32x2_t) __builtin_aarch64_ssra_nv2si (__a, __b, __c);
}
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vrshl_s64 (int64x1_t __a, int64x1_t __b)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqaddq_u32 (uint32x4_t __a, int32x4_t __b)
++vsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
{
- return (int64x1_t) {__builtin_aarch64_srshldi (__a[0], __b[0])};
-+ return __builtin_aarch64_usqaddv4si_uus (__a, __b);
++ return (int64x1_t) {__builtin_aarch64_ssra_ndi (__a[0], __b[0], __c)};
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrshl_u8 (uint8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqaddq_u64 (uint64x2_t __a, int64x2_t __b)
++vsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
{
- return __builtin_aarch64_urshlv8qi_uus (__a, __b);
-+ return __builtin_aarch64_usqaddv2di_uus (__a, __b);
++ return __builtin_aarch64_usra_nv8qi_uuus (__a, __b, __c);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vrshl_u16 (uint16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline uint8_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqaddb_u8 (uint8_t __a, int8_t __b)
++vsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
{
- return __builtin_aarch64_urshlv4hi_uus (__a, __b);
-+ return __builtin_aarch64_usqaddqi_uus (__a, __b);
++ return __builtin_aarch64_usra_nv4hi_uuus (__a, __b, __c);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrshl_u32 (uint32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline uint16_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqaddh_u16 (uint16_t __a, int16_t __b)
++vsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
{
- return __builtin_aarch64_urshlv2si_uus (__a, __b);
-+ return __builtin_aarch64_usqaddhi_uus (__a, __b);
++ return __builtin_aarch64_usra_nv2si_uuus (__a, __b, __c);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vrshl_u64 (uint64x1_t __a, int64x1_t __b)
-+__extension__ extern __inline uint32_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqadds_u32 (uint32_t __a, int32_t __b)
++vsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
{
- return (uint64x1_t) {__builtin_aarch64_urshldi_uus (__a[0], __b[0])};
-+ return __builtin_aarch64_usqaddsi_uus (__a, __b);
++ return (uint64x1_t) {__builtin_aarch64_usra_ndi_uuus (__a[0], __b[0], __c)};
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vrshlq_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqaddd_u64 (uint64_t __a, int64_t __b)
++vsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
{
- return (int8x16_t) __builtin_aarch64_srshlv16qi (__a, __b);
-+ return __builtin_aarch64_usqadddi_uus (__a, __b);
++ return (int8x16_t) __builtin_aarch64_ssra_nv16qi (__a, __b, __c);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vrshlq_s16 (int16x8_t __a, int16x8_t __b)
-+/* vsqrt */
-+__extension__ extern __inline float32x2_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqrt_f32 (float32x2_t a)
++vsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
{
- return (int16x8_t) __builtin_aarch64_srshlv8hi (__a, __b);
-+ return __builtin_aarch64_sqrtv2sf (a);
++ return (int16x8_t) __builtin_aarch64_ssra_nv8hi (__a, __b, __c);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vrshlq_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline float32x4_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqrtq_f32 (float32x4_t a)
++vsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
{
- return (int32x4_t) __builtin_aarch64_srshlv4si (__a, __b);
-+ return __builtin_aarch64_sqrtv4sf (a);
++ return (int32x4_t) __builtin_aarch64_ssra_nv4si (__a, __b, __c);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vrshlq_s64 (int64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline float64x1_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqrt_f64 (float64x1_t a)
++vsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
{
- return (int64x2_t) __builtin_aarch64_srshlv2di (__a, __b);
-+ return (float64x1_t) { __builtin_aarch64_sqrtdf (a[0]) };
++ return (int64x2_t) __builtin_aarch64_ssra_nv2di (__a, __b, __c);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vrshlq_u8 (uint8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline float64x2_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqrtq_f64 (float64x2_t a)
++vsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
{
- return __builtin_aarch64_urshlv16qi_uus (__a, __b);
-+ return __builtin_aarch64_sqrtv2df (a);
++ return __builtin_aarch64_usra_nv16qi_uuus (__a, __b, __c);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vrshlq_u16 (uint16x8_t __a, int16x8_t __b)
-+/* vsra */
-+
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
++vsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
{
- return __builtin_aarch64_urshlv8hi_uus (__a, __b);
-+ return (int8x8_t) __builtin_aarch64_ssra_nv8qi (__a, __b, __c);
++ return __builtin_aarch64_usra_nv8hi_uuus (__a, __b, __c);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vrshlq_u32 (uint32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
++vsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
{
- return __builtin_aarch64_urshlv4si_uus (__a, __b);
-+ return (int16x4_t) __builtin_aarch64_ssra_nv4hi (__a, __b, __c);
++ return __builtin_aarch64_usra_nv4si_uuus (__a, __b, __c);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vrshlq_u64 (uint64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
++vsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
{
- return __builtin_aarch64_urshlv2di_uus (__a, __b);
-+ return (int32x2_t) __builtin_aarch64_ssra_nv2si (__a, __b, __c);
++ return __builtin_aarch64_usra_nv2di_uuus (__a, __b, __c);
}
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vrshld_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline int64x1_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
++vsrad_n_s64 (int64_t __a, int64_t __b, const int __c)
{
- return __builtin_aarch64_srshldi (__a, __b);
-+ return (int64x1_t) {__builtin_aarch64_ssra_ndi (__a[0], __b[0], __c)};
++ return __builtin_aarch64_ssra_ndi (__a, __b, __c);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vrshld_u64 (uint64_t __a, int64_t __b)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
++vsrad_n_u64 (uint64_t __a, uint64_t __b, const int __c)
{
- return __builtin_aarch64_urshldi_uus (__a, __b);
-+ return __builtin_aarch64_usra_nv8qi_uuus (__a, __b, __c);
++ return __builtin_aarch64_usra_ndi_uuus (__a, __b, __c);
}
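
For reference, a minimal usage sketch of the shift-right-accumulate family converted above (illustrative only; assumes an AArch64 toolchain and <arm_neon.h>):

#include <arm_neon.h>

/* Per lane: a + (b >> 2); compiles to a single USRA instruction.  */
uint8x8_t
add_scaled (uint8x8_t a, uint8x8_t b)
{
  return vsra_n_u8 (a, b, 2);   /* shift count must be a constant in 1..8 */
}
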
-/* vrshr */
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrshr_n_s8 (int8x8_t __a, const int __b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
- {
+-{
- return (int8x8_t) __builtin_aarch64_srshr_nv8qi (__a, __b);
-+ return __builtin_aarch64_usra_nv4hi_uuus (__a, __b, __c);
- }
+-}
++/* vsri */
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vrshr_n_s16 (int16x4_t __a, const int __b)
-+__extension__ extern __inline uint32x2_t
++__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
++vsri_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
{
- return (int16x4_t) __builtin_aarch64_srshr_nv4hi (__a, __b);
-+ return __builtin_aarch64_usra_nv2si_uuus (__a, __b, __c);
++ return (int8x8_t) __builtin_aarch64_ssri_nv8qi (__a, __b, __c);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vrshr_n_s32 (int32x2_t __a, const int __b)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
++vsri_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
{
- return (int32x2_t) __builtin_aarch64_srshr_nv2si (__a, __b);
-+ return (uint64x1_t) {__builtin_aarch64_usra_ndi_uuus (__a[0], __b[0], __c)};
++ return (int16x4_t) __builtin_aarch64_ssri_nv4hi (__a, __b, __c);
}
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vrshr_n_s64 (int64x1_t __a, const int __b)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
++vsri_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
{
- return (int64x1_t) {__builtin_aarch64_srshr_ndi (__a[0], __b)};
-+ return (int8x16_t) __builtin_aarch64_ssra_nv16qi (__a, __b, __c);
++ return (int32x2_t) __builtin_aarch64_ssri_nv2si (__a, __b, __c);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrshr_n_u8 (uint8x8_t __a, const int __b)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
++vsri_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
{
- return __builtin_aarch64_urshr_nv8qi_uus (__a, __b);
-+ return (int16x8_t) __builtin_aarch64_ssra_nv8hi (__a, __b, __c);
++ return (int64x1_t) {__builtin_aarch64_ssri_ndi (__a[0], __b[0], __c)};
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vrshr_n_u16 (uint16x4_t __a, const int __b)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
++vsri_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
{
- return __builtin_aarch64_urshr_nv4hi_uus (__a, __b);
-+ return (int32x4_t) __builtin_aarch64_ssra_nv4si (__a, __b, __c);
++ return __builtin_aarch64_usri_nv8qi_uuus (__a, __b, __c);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrshr_n_u32 (uint32x2_t __a, const int __b)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
++vsri_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
{
- return __builtin_aarch64_urshr_nv2si_uus (__a, __b);
-+ return (int64x2_t) __builtin_aarch64_ssra_nv2di (__a, __b, __c);
++ return __builtin_aarch64_usri_nv4hi_uuus (__a, __b, __c);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vrshr_n_u64 (uint64x1_t __a, const int __b)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
++vsri_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
{
- return (uint64x1_t) {__builtin_aarch64_urshr_ndi_uus (__a[0], __b)};
-+ return __builtin_aarch64_usra_nv16qi_uuus (__a, __b, __c);
++ return __builtin_aarch64_usri_nv2si_uuus (__a, __b, __c);
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vrshrq_n_s8 (int8x16_t __a, const int __b)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
++vsri_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
{
- return (int8x16_t) __builtin_aarch64_srshr_nv16qi (__a, __b);
-+ return __builtin_aarch64_usra_nv8hi_uuus (__a, __b, __c);
++ return (uint64x1_t) {__builtin_aarch64_usri_ndi_uuus (__a[0], __b[0], __c)};
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vrshrq_n_s16 (int16x8_t __a, const int __b)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
++vsriq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
{
- return (int16x8_t) __builtin_aarch64_srshr_nv8hi (__a, __b);
-+ return __builtin_aarch64_usra_nv4si_uuus (__a, __b, __c);
++ return (int8x16_t) __builtin_aarch64_ssri_nv16qi (__a, __b, __c);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vrshrq_n_s32 (int32x4_t __a, const int __b)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
++vsriq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
{
- return (int32x4_t) __builtin_aarch64_srshr_nv4si (__a, __b);
-+ return __builtin_aarch64_usra_nv2di_uuus (__a, __b, __c);
++ return (int16x8_t) __builtin_aarch64_ssri_nv8hi (__a, __b, __c);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vrshrq_n_s64 (int64x2_t __a, const int __b)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsrad_n_s64 (int64_t __a, int64_t __b, const int __c)
++vsriq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
{
- return (int64x2_t) __builtin_aarch64_srshr_nv2di (__a, __b);
-+ return __builtin_aarch64_ssra_ndi (__a, __b, __c);
++ return (int32x4_t) __builtin_aarch64_ssri_nv4si (__a, __b, __c);
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vrshrq_n_u8 (uint8x16_t __a, const int __b)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsrad_n_u64 (uint64_t __a, uint64_t __b, const int __c)
++vsriq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
{
- return __builtin_aarch64_urshr_nv16qi_uus (__a, __b);
-+ return __builtin_aarch64_usra_ndi_uuus (__a, __b, __c);
++ return (int64x2_t) __builtin_aarch64_ssri_nv2di (__a, __b, __c);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vrshrq_n_u16 (uint16x8_t __a, const int __b)
-+/* vsri */
-+
-+__extension__ extern __inline int8x8_t
++__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsri_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
++vsriq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
{
- return __builtin_aarch64_urshr_nv8hi_uus (__a, __b);
-+ return (int8x8_t) __builtin_aarch64_ssri_nv8qi (__a, __b, __c);
++ return __builtin_aarch64_usri_nv16qi_uuus (__a, __b, __c);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vrshrq_n_u32 (uint32x4_t __a, const int __b)
-+__extension__ extern __inline int16x4_t
++__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsri_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
++vsriq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
{
- return __builtin_aarch64_urshr_nv4si_uus (__a, __b);
-+ return (int16x4_t) __builtin_aarch64_ssri_nv4hi (__a, __b, __c);
++ return __builtin_aarch64_usri_nv8hi_uuus (__a, __b, __c);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vrshrq_n_u64 (uint64x2_t __a, const int __b)
-+__extension__ extern __inline int32x2_t
++__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsri_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
++vsriq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
{
- return __builtin_aarch64_urshr_nv2di_uus (__a, __b);
-+ return (int32x2_t) __builtin_aarch64_ssri_nv2si (__a, __b, __c);
++ return __builtin_aarch64_usri_nv4si_uuus (__a, __b, __c);
}
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vrshrd_n_s64 (int64_t __a, const int __b)
-+__extension__ extern __inline int64x1_t
++__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsri_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
++vsriq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
{
- return __builtin_aarch64_srshr_ndi (__a, __b);
-+ return (int64x1_t) {__builtin_aarch64_ssri_ndi (__a[0], __b[0], __c)};
++ return __builtin_aarch64_usri_nv2di_uuus (__a, __b, __c);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vrshrd_n_u64 (uint64_t __a, const int __b)
-+__extension__ extern __inline uint8x8_t
++__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsri_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
++vsrid_n_s64 (int64_t __a, int64_t __b, const int __c)
{
- return __builtin_aarch64_urshr_ndi_uus (__a, __b);
-+ return __builtin_aarch64_usri_nv8qi_uuus (__a, __b, __c);
++ return __builtin_aarch64_ssri_ndi (__a, __b, __c);
}
-/* vrsra */
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
-+__extension__ extern __inline uint16x4_t
++__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsri_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
++vsrid_n_u64 (uint64_t __a, uint64_t __b, const int __c)
{
- return (int8x8_t) __builtin_aarch64_srsra_nv8qi (__a, __b, __c);
-+ return __builtin_aarch64_usri_nv4hi_uuus (__a, __b, __c);
++ return __builtin_aarch64_usri_ndi_uuus (__a, __b, __c);
}
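
A corresponding sketch for the shift-right-insert family (SRI), again illustrative only:

#include <arm_neon.h>

/* Per lane: keep the top 4 bits of hi, insert the top 4 bits of lo
   below them, i.e. (hi & 0xf0) | (lo >> 4).  */
uint8x8_t
merge_nibbles (uint8x8_t hi, uint8x8_t lo)
{
  return vsri_n_u8 (hi, lo, 4);
}
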
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vrsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsri_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
- {
+-{
- return (int16x4_t) __builtin_aarch64_srsra_nv4hi (__a, __b, __c);
-+ return __builtin_aarch64_usri_nv2si_uuus (__a, __b, __c);
- }
+-}
++/* vst1 */
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vrsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
-+__extension__ extern __inline uint64x1_t
++__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsri_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
++vst1_f16 (float16_t *__a, float16x4_t __b)
{
- return (int32x2_t) __builtin_aarch64_srsra_nv2si (__a, __b, __c);
-+ return (uint64x1_t) {__builtin_aarch64_usri_ndi_uuus (__a[0], __b[0], __c)};
++ __builtin_aarch64_st1v4hf (__a, __b);
}
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vrsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
-+__extension__ extern __inline int8x16_t
++__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsriq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
++vst1_f32 (float32_t *a, float32x2_t b)
{
- return (int64x1_t) {__builtin_aarch64_srsra_ndi (__a[0], __b[0], __c)};
-+ return (int8x16_t) __builtin_aarch64_ssri_nv16qi (__a, __b, __c);
++ __builtin_aarch64_st1v2sf ((__builtin_aarch64_simd_sf *) a, b);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
-+__extension__ extern __inline int16x8_t
++__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsriq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
++vst1_f64 (float64_t *a, float64x1_t b)
{
- return __builtin_aarch64_ursra_nv8qi_uuus (__a, __b, __c);
-+ return (int16x8_t) __builtin_aarch64_ssri_nv8hi (__a, __b, __c);
++ *a = b[0];
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vrsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
-+__extension__ extern __inline int32x4_t
++__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsriq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
++vst1_p8 (poly8_t *a, poly8x8_t b)
{
- return __builtin_aarch64_ursra_nv4hi_uuus (__a, __b, __c);
-+ return (int32x4_t) __builtin_aarch64_ssri_nv4si (__a, __b, __c);
++ __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a,
++ (int8x8_t) b);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
-+__extension__ extern __inline int64x2_t
++__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsriq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
++vst1_p16 (poly16_t *a, poly16x4_t b)
{
- return __builtin_aarch64_ursra_nv2si_uuus (__a, __b, __c);
-+ return (int64x2_t) __builtin_aarch64_ssri_nv2di (__a, __b, __c);
++ __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a,
++ (int16x4_t) b);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vrsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
-+__extension__ extern __inline uint8x16_t
++__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsriq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
++vst1_p64 (poly64_t *a, poly64x1_t b)
{
- return (uint64x1_t) {__builtin_aarch64_ursra_ndi_uuus (__a[0], __b[0], __c)};
-+ return __builtin_aarch64_usri_nv16qi_uuus (__a, __b, __c);
++ *a = b[0];
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vrsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
-+__extension__ extern __inline uint16x8_t
++__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsriq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
++vst1_s8 (int8_t *a, int8x8_t b)
{
- return (int8x16_t) __builtin_aarch64_srsra_nv16qi (__a, __b, __c);
-+ return __builtin_aarch64_usri_nv8hi_uuus (__a, __b, __c);
++ __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, b);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vrsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
-+__extension__ extern __inline uint32x4_t
++__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsriq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
++vst1_s16 (int16_t *a, int16x4_t b)
{
- return (int16x8_t) __builtin_aarch64_srsra_nv8hi (__a, __b, __c);
-+ return __builtin_aarch64_usri_nv4si_uuus (__a, __b, __c);
++ __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, b);
}
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vrsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
-+__extension__ extern __inline uint64x2_t
++__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsriq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
++vst1_s32 (int32_t *a, int32x2_t b)
{
- return (int32x4_t) __builtin_aarch64_srsra_nv4si (__a, __b, __c);
-+ return __builtin_aarch64_usri_nv2di_uuus (__a, __b, __c);
++ __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a, b);
}
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vrsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
-+__extension__ extern __inline int64_t
++__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsrid_n_s64 (int64_t __a, int64_t __b, const int __c)
++vst1_s64 (int64_t *a, int64x1_t b)
{
- return (int64x2_t) __builtin_aarch64_srsra_nv2di (__a, __b, __c);
-+ return __builtin_aarch64_ssri_ndi (__a, __b, __c);
++ *a = b[0];
}
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vrsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
-+__extension__ extern __inline uint64_t
++__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsrid_n_u64 (uint64_t __a, uint64_t __b, const int __c)
++vst1_u8 (uint8_t *a, uint8x8_t b)
{
- return __builtin_aarch64_ursra_nv16qi_uuus (__a, __b, __c);
-+ return __builtin_aarch64_usri_ndi_uuus (__a, __b, __c);
++ __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a,
++ (int8x8_t) b);
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vrsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
-+/* vst1 */
-+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_f16 (float16_t *__a, float16x4_t __b)
++vst1_u16 (uint16_t *a, uint16x4_t b)
{
- return __builtin_aarch64_ursra_nv8hi_uuus (__a, __b, __c);
-+ __builtin_aarch64_st1v4hf (__a, __b);
++ __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a,
++ (int16x4_t) b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vrsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_f32 (float32_t *a, float32x2_t b)
++vst1_u32 (uint32_t *a, uint32x2_t b)
{
- return __builtin_aarch64_ursra_nv4si_uuus (__a, __b, __c);
-+ __builtin_aarch64_st1v2sf ((__builtin_aarch64_simd_sf *) a, b);
++ __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a,
++ (int32x2_t) b);
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vrsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_f64 (float64_t *a, float64x1_t b)
++vst1_u64 (uint64_t *a, uint64x1_t b)
{
- return __builtin_aarch64_ursra_nv2di_uuus (__a, __b, __c);
+ *a = b[0];
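
A usage sketch for the plain vst1 stores above (illustrative only):

#include <arm_neon.h>

/* Writes both lanes of v to dst[0..1] with a single ST1.  */
void
store_pair (float32_t *dst, float32x2_t v)
{
  vst1_f32 (dst, v);
}
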
@@ -41446,24 +43329,24 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vrsrad_n_s64 (int64_t __a, int64_t __b, const int __c)
++/* vst1q */
++
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_p8 (poly8_t *a, poly8x8_t b)
++vst1q_f16 (float16_t *__a, float16x8_t __b)
{
- return __builtin_aarch64_srsra_ndi (__a, __b, __c);
-+ __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a,
-+ (int8x8_t) b);
++ __builtin_aarch64_st1v8hf (__a, __b);
}
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vrsrad_n_u64 (uint64_t __a, uint64_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_p16 (poly16_t *a, poly16x4_t b)
++vst1q_f32 (float32_t *a, float32x4_t b)
{
- return __builtin_aarch64_ursra_ndi_uuus (__a, __b, __c);
-+ __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a,
-+ (int16x4_t) b);
++ __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b);
}
-#pragma GCC push_options
@@ -41475,235 +43358,236 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vsha1cq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_s8 (int8_t *a, int8x8_t b)
++vst1q_f64 (float64_t *a, float64x2_t b)
{
- return __builtin_aarch64_crypto_sha1cv4si_uuuu (hash_abcd, hash_e, wk);
-+ __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, b);
++ __builtin_aarch64_st1v2df ((__builtin_aarch64_simd_df *) a, b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsha1mq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_s16 (int16_t *a, int16x4_t b)
++vst1q_p8 (poly8_t *a, poly8x16_t b)
{
- return __builtin_aarch64_crypto_sha1mv4si_uuuu (hash_abcd, hash_e, wk);
-+ __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, b);
++ __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a,
++ (int8x16_t) b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsha1pq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_s32 (int32_t *a, int32x2_t b)
++vst1q_p16 (poly16_t *a, poly16x8_t b)
{
- return __builtin_aarch64_crypto_sha1pv4si_uuuu (hash_abcd, hash_e, wk);
-+ __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a, b);
++ __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a,
++ (int16x8_t) b);
}
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vsha1h_u32 (uint32_t hash_e)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_s64 (int64_t *a, int64x1_t b)
++vst1q_p64 (poly64_t *a, poly64x2_t b)
{
- return __builtin_aarch64_crypto_sha1hsi_uu (hash_e);
-+ *a = b[0];
++ __builtin_aarch64_st1v2di_sp ((__builtin_aarch64_simd_di *) a,
++ (poly64x2_t) b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsha1su0q_u32 (uint32x4_t w0_3, uint32x4_t w4_7, uint32x4_t w8_11)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_u8 (uint8_t *a, uint8x8_t b)
++vst1q_s8 (int8_t *a, int8x16_t b)
{
- return __builtin_aarch64_crypto_sha1su0v4si_uuuu (w0_3, w4_7, w8_11);
-+ __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a,
-+ (int8x8_t) b);
++ __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsha1su1q_u32 (uint32x4_t tw0_3, uint32x4_t w12_15)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_u16 (uint16_t *a, uint16x4_t b)
++vst1q_s16 (int16_t *a, int16x8_t b)
{
- return __builtin_aarch64_crypto_sha1su1v4si_uuu (tw0_3, w12_15);
-+ __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a,
-+ (int16x4_t) b);
++ __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsha256hq_u32 (uint32x4_t hash_abcd, uint32x4_t hash_efgh, uint32x4_t wk)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_u32 (uint32_t *a, uint32x2_t b)
++vst1q_s32 (int32_t *a, int32x4_t b)
{
- return __builtin_aarch64_crypto_sha256hv4si_uuuu (hash_abcd, hash_efgh, wk);
-+ __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a,
-+ (int32x2_t) b);
++ __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a, b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsha256h2q_u32 (uint32x4_t hash_efgh, uint32x4_t hash_abcd, uint32x4_t wk)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_u64 (uint64_t *a, uint64x1_t b)
++vst1q_s64 (int64_t *a, int64x2_t b)
{
- return __builtin_aarch64_crypto_sha256h2v4si_uuuu (hash_efgh, hash_abcd, wk);
-+ *a = b[0];
++ __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a, b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsha256su0q_u32 (uint32x4_t w0_3, uint32x4_t w4_7)
-+/* vst1q */
-+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_f16 (float16_t *__a, float16x8_t __b)
++vst1q_u8 (uint8_t *a, uint8x16_t b)
{
- return __builtin_aarch64_crypto_sha256su0v4si_uuu (w0_3, w4_7);
-+ __builtin_aarch64_st1v8hf (__a, __b);
++ __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a,
++ (int8x16_t) b);
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsha256su1q_u32 (uint32x4_t tw0_3, uint32x4_t w8_11, uint32x4_t w12_15)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_f32 (float32_t *a, float32x4_t b)
++vst1q_u16 (uint16_t *a, uint16x8_t b)
{
- return __builtin_aarch64_crypto_sha256su1v4si_uuuu (tw0_3, w8_11, w12_15);
-+ __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b);
++ __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a,
++ (int16x8_t) b);
}
-__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
-vmull_p64 (poly64_t a, poly64_t b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_f64 (float64_t *a, float64x2_t b)
++vst1q_u32 (uint32_t *a, uint32x4_t b)
{
- return
- __builtin_aarch64_crypto_pmulldi_ppp (a, b);
-+ __builtin_aarch64_st1v2df ((__builtin_aarch64_simd_df *) a, b);
++ __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a,
++ (int32x4_t) b);
}
-__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
-vmull_high_p64 (poly64x2_t a, poly64x2_t b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_p8 (poly8_t *a, poly8x16_t b)
++vst1q_u64 (uint64_t *a, uint64x2_t b)
{
- return __builtin_aarch64_crypto_pmullv2di_ppp (a, b);
-+ __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a,
-+ (int8x16_t) b);
++ __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a,
++ (int64x2_t) b);
}
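
And the quadword form, storing a full 128-bit register (illustrative only):

#include <arm_neon.h>

/* Writes all 16 lanes of v to dst[0..15].  */
void
store_16 (uint8_t *dst, uint8x16_t v)
{
  vst1q_u8 (dst, v);
}
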
-#pragma GCC pop_options
--
++/* vst1_lane */
+
-/* vshl */
--
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vst1_lane_f16 (float16_t *__a, float16x4_t __b, const int __lane)
++{
++ *__a = __aarch64_vget_lane_any (__b, __lane);
++}
+
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vshl_n_s8 (int8x8_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_p16 (poly16_t *a, poly16x8_t b)
++vst1_lane_f32 (float32_t *__a, float32x2_t __b, const int __lane)
{
- return (int8x8_t) __builtin_aarch64_ashlv8qi (__a, __b);
-+ __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a,
-+ (int16x8_t) b);
++ *__a = __aarch64_vget_lane_any (__b, __lane);
}
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vshl_n_s16 (int16x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_s8 (int8_t *a, int8x16_t b)
++vst1_lane_f64 (float64_t *__a, float64x1_t __b, const int __lane)
{
- return (int16x4_t) __builtin_aarch64_ashlv4hi (__a, __b);
-+ __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, b);
++ *__a = __aarch64_vget_lane_any (__b, __lane);
}
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vshl_n_s32 (int32x2_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_s16 (int16_t *a, int16x8_t b)
++vst1_lane_p8 (poly8_t *__a, poly8x8_t __b, const int __lane)
{
- return (int32x2_t) __builtin_aarch64_ashlv2si (__a, __b);
-+ __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, b);
++ *__a = __aarch64_vget_lane_any (__b, __lane);
}
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vshl_n_s64 (int64x1_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_s32 (int32_t *a, int32x4_t b)
++vst1_lane_p16 (poly16_t *__a, poly16x4_t __b, const int __lane)
{
- return (int64x1_t) {__builtin_aarch64_ashldi (__a[0], __b)};
-+ __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a, b);
++ *__a = __aarch64_vget_lane_any (__b, __lane);
}
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vshl_n_u8 (uint8x8_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_s64 (int64_t *a, int64x2_t b)
++vst1_lane_p64 (poly64_t *__a, poly64x1_t __b, const int __lane)
{
- return (uint8x8_t) __builtin_aarch64_ashlv8qi ((int8x8_t) __a, __b);
-+ __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a, b);
++ *__a = __aarch64_vget_lane_any (__b, __lane);
}
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vshl_n_u16 (uint16x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_u8 (uint8_t *a, uint8x16_t b)
++vst1_lane_s8 (int8_t *__a, int8x8_t __b, const int __lane)
{
- return (uint16x4_t) __builtin_aarch64_ashlv4hi ((int16x4_t) __a, __b);
-+ __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a,
-+ (int8x16_t) b);
++ *__a = __aarch64_vget_lane_any (__b, __lane);
}
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vshl_n_u32 (uint32x2_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_u16 (uint16_t *a, uint16x8_t b)
++vst1_lane_s16 (int16_t *__a, int16x4_t __b, const int __lane)
{
- return (uint32x2_t) __builtin_aarch64_ashlv2si ((int32x2_t) __a, __b);
-+ __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a,
-+ (int16x8_t) b);
++ *__a = __aarch64_vget_lane_any (__b, __lane);
}
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vshl_n_u64 (uint64x1_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_u32 (uint32_t *a, uint32x4_t b)
++vst1_lane_s32 (int32_t *__a, int32x2_t __b, const int __lane)
{
- return (uint64x1_t) {__builtin_aarch64_ashldi ((int64_t) __a[0], __b)};
-+ __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a,
-+ (int32x4_t) b);
++ *__a = __aarch64_vget_lane_any (__b, __lane);
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vshlq_n_s8 (int8x16_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_u64 (uint64_t *a, uint64x2_t b)
++vst1_lane_s64 (int64_t *__a, int64x1_t __b, const int __lane)
{
- return (int8x16_t) __builtin_aarch64_ashlv16qi (__a, __b);
-+ __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a,
-+ (int64x2_t) b);
++ *__a = __aarch64_vget_lane_any (__b, __lane);
}
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vshlq_n_s16 (int16x8_t __a, const int __b)
-+/* vst1_lane */
-+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_f16 (float16_t *__a, float16x4_t __b, const int __lane)
++vst1_lane_u8 (uint8_t *__a, uint8x8_t __b, const int __lane)
{
- return (int16x8_t) __builtin_aarch64_ashlv8hi (__a, __b);
+ *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -41713,7 +43597,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vshlq_n_s32 (int32x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_f32 (float32_t *__a, float32x2_t __b, const int __lane)
++vst1_lane_u16 (uint16_t *__a, uint16x4_t __b, const int __lane)
{
- return (int32x4_t) __builtin_aarch64_ashlv4si (__a, __b);
+ *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -41723,7 +43607,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vshlq_n_s64 (int64x2_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_f64 (float64_t *__a, float64x1_t __b, const int __lane)
++vst1_lane_u32 (uint32_t *__a, uint32x2_t __b, const int __lane)
{
- return (int64x2_t) __builtin_aarch64_ashlv2di (__a, __b);
+ *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -41733,7 +43617,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vshlq_n_u8 (uint8x16_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_p8 (poly8_t *__a, poly8x8_t __b, const int __lane)
++vst1_lane_u64 (uint64_t *__a, uint64x1_t __b, const int __lane)
{
- return (uint8x16_t) __builtin_aarch64_ashlv16qi ((int8x16_t) __a, __b);
+ *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -41741,9 +43625,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vshlq_n_u16 (uint16x8_t __a, const int __b)
++/* vst1q_lane */
++
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_p16 (poly16_t *__a, poly16x4_t __b, const int __lane)
++vst1q_lane_f16 (float16_t *__a, float16x8_t __b, const int __lane)
{
- return (uint16x8_t) __builtin_aarch64_ashlv8hi ((int16x8_t) __a, __b);
+ *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -41753,7 +43639,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vshlq_n_u32 (uint32x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_s8 (int8_t *__a, int8x8_t __b, const int __lane)
++vst1q_lane_f32 (float32_t *__a, float32x4_t __b, const int __lane)
{
- return (uint32x4_t) __builtin_aarch64_ashlv4si ((int32x4_t) __a, __b);
+ *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -41763,7 +43649,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vshlq_n_u64 (uint64x2_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_s16 (int16_t *__a, int16x4_t __b, const int __lane)
++vst1q_lane_f64 (float64_t *__a, float64x2_t __b, const int __lane)
{
- return (uint64x2_t) __builtin_aarch64_ashlv2di ((int64x2_t) __a, __b);
+ *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -41773,7 +43659,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vshld_n_s64 (int64_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_s32 (int32_t *__a, int32x2_t __b, const int __lane)
++vst1q_lane_p8 (poly8_t *__a, poly8x16_t __b, const int __lane)
{
- return __builtin_aarch64_ashldi (__a, __b);
+ *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -41783,7 +43669,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vshld_n_u64 (uint64_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_s64 (int64_t *__a, int64x1_t __b, const int __lane)
++vst1q_lane_p16 (poly16_t *__a, poly16x8_t __b, const int __lane)
{
- return (uint64_t) __builtin_aarch64_ashldi (__a, __b);
+ *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -41793,7 +43679,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vshl_s8 (int8x8_t __a, int8x8_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_u8 (uint8_t *__a, uint8x8_t __b, const int __lane)
++vst1q_lane_p64 (poly64_t *__a, poly64x2_t __b, const int __lane)
{
- return __builtin_aarch64_sshlv8qi (__a, __b);
+ *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -41803,7 +43689,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vshl_s16 (int16x4_t __a, int16x4_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_u16 (uint16_t *__a, uint16x4_t __b, const int __lane)
++vst1q_lane_s8 (int8_t *__a, int8x16_t __b, const int __lane)
{
- return __builtin_aarch64_sshlv4hi (__a, __b);
+ *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -41813,7 +43699,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vshl_s32 (int32x2_t __a, int32x2_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_u32 (uint32_t *__a, uint32x2_t __b, const int __lane)
++vst1q_lane_s16 (int16_t *__a, int16x8_t __b, const int __lane)
{
- return __builtin_aarch64_sshlv2si (__a, __b);
+ *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -41823,7 +43709,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vshl_s64 (int64x1_t __a, int64x1_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_u64 (uint64_t *__a, uint64x1_t __b, const int __lane)
++vst1q_lane_s32 (int32_t *__a, int32x4_t __b, const int __lane)
{
- return (int64x1_t) {__builtin_aarch64_sshldi (__a[0], __b[0])};
+ *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -41831,11 +43717,9 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vshl_u8 (uint8x8_t __a, int8x8_t __b)
-+/* vst1q_lane */
-+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_f16 (float16_t *__a, float16x8_t __b, const int __lane)
++vst1q_lane_s64 (int64_t *__a, int64x2_t __b, const int __lane)
{
- return __builtin_aarch64_ushlv8qi_uus (__a, __b);
+ *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -41845,7 +43729,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vshl_u16 (uint16x4_t __a, int16x4_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_f32 (float32_t *__a, float32x4_t __b, const int __lane)
++vst1q_lane_u8 (uint8_t *__a, uint8x16_t __b, const int __lane)
{
- return __builtin_aarch64_ushlv4hi_uus (__a, __b);
+ *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -41855,7 +43739,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vshl_u32 (uint32x2_t __a, int32x2_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_f64 (float64_t *__a, float64x2_t __b, const int __lane)
++vst1q_lane_u16 (uint16_t *__a, uint16x8_t __b, const int __lane)
{
- return __builtin_aarch64_ushlv2si_uus (__a, __b);
+ *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -41865,7 +43749,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vshl_u64 (uint64x1_t __a, int64x1_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_p8 (poly8_t *__a, poly8x16_t __b, const int __lane)
++vst1q_lane_u32 (uint32_t *__a, uint32x4_t __b, const int __lane)
{
- return (uint64x1_t) {__builtin_aarch64_ushldi_uus (__a[0], __b[0])};
+ *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -41875,7 +43759,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-vshlq_s8 (int8x16_t __a, int8x16_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_p16 (poly16_t *__a, poly16x8_t __b, const int __lane)
++vst1q_lane_u64 (uint64_t *__a, uint64x2_t __b, const int __lane)
{
- return __builtin_aarch64_sshlv16qi (__a, __b);
+ *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -41883,93 +43767,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vshlq_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_s8 (int8_t *__a, int8x16_t __b, const int __lane)
- {
-- return __builtin_aarch64_sshlv8hi (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vshlq_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_s16 (int16_t *__a, int16x8_t __b, const int __lane)
- {
-- return __builtin_aarch64_sshlv4si (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vshlq_s64 (int64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_s32 (int32_t *__a, int32x4_t __b, const int __lane)
- {
-- return __builtin_aarch64_sshlv2di (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vshlq_u8 (uint8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_s64 (int64_t *__a, int64x2_t __b, const int __lane)
- {
-- return __builtin_aarch64_ushlv16qi_uus (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vshlq_u16 (uint16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_u8 (uint8_t *__a, uint8x16_t __b, const int __lane)
- {
-- return __builtin_aarch64_ushlv8hi_uus (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vshlq_u32 (uint32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_u16 (uint16_t *__a, uint16x8_t __b, const int __lane)
- {
-- return __builtin_aarch64_ushlv4si_uus (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vshlq_u64 (uint64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_u32 (uint32_t *__a, uint32x4_t __b, const int __lane)
- {
-- return __builtin_aarch64_ushlv2di_uus (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vshld_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_u64 (uint64_t *__a, uint64x2_t __b, const int __lane)
- {
-- return __builtin_aarch64_sshldi (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vshld_u64 (uint64_t __a, uint64_t __b)
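
The lane stores reduce to a scalar store of the selected element, as the *__a = __aarch64_vget_lane_any (...) bodies above show. A usage sketch (illustrative only):

#include <arm_neon.h>

/* Stores only lane 1 (the second element) of v.  */
void
store_lane1 (int32_t *dst, int32x4_t v)
{
  vst1q_lane_s32 (dst, v, 1);   /* lane must be a constant in 0..3 */
}
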
+/* vstn */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_s64 (int64_t * __a, int64x1x2_t val)
{
-- return __builtin_aarch64_ushldi_uus (__a, __b);
+- return __builtin_aarch64_sshlv8hi (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ int64x2x2_t temp;
+ temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
@@ -41979,13 +43783,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o);
}
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vshll_high_n_s8 (int8x16_t __a, const int __b)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vshlq_s32 (int32x4_t __a, int32x4_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_u64 (uint64_t * __a, uint64x1x2_t val)
{
-- return __builtin_aarch64_sshll2_nv16qi (__a, __b);
+- return __builtin_aarch64_sshlv4si (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ uint64x2x2_t temp;
+ temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
@@ -41995,13 +43799,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o);
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vshll_high_n_s16 (int16x8_t __a, const int __b)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vshlq_s64 (int64x2_t __a, int64x2_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_f64 (float64_t * __a, float64x1x2_t val)
{
-- return __builtin_aarch64_sshll2_nv8hi (__a, __b);
+- return __builtin_aarch64_sshlv2di (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ float64x2x2_t temp;
+ temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
@@ -42011,13 +43815,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st2df ((__builtin_aarch64_simd_df *) __a, __o);
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vshll_high_n_s32 (int32x4_t __a, const int __b)
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vshlq_u8 (uint8x16_t __a, int8x16_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_s8 (int8_t * __a, int8x8x2_t val)
{
-- return __builtin_aarch64_sshll2_nv4si (__a, __b);
+- return __builtin_aarch64_ushlv16qi_uus (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ int8x16x2_t temp;
+ temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
@@ -42028,12 +43832,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vshll_high_n_u8 (uint8x16_t __a, const int __b)
+-vshlq_u16 (uint16x8_t __a, int16x8_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_p8 (poly8_t * __a, poly8x8x2_t val)
{
-- return (uint16x8_t) __builtin_aarch64_ushll2_nv16qi ((int8x16_t) __a, __b);
+- return __builtin_aarch64_ushlv8hi_uus (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ poly8x16x2_t temp;
+ temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
@@ -42044,12 +43848,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vshll_high_n_u16 (uint16x8_t __a, const int __b)
+-vshlq_u32 (uint32x4_t __a, int32x4_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_s16 (int16_t * __a, int16x4x2_t val)
{
-- return (uint32x4_t) __builtin_aarch64_ushll2_nv8hi ((int16x8_t) __a, __b);
+- return __builtin_aarch64_ushlv4si_uus (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ int16x8x2_t temp;
+ temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
@@ -42060,12 +43864,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vshll_high_n_u32 (uint32x4_t __a, const int __b)
+-vshlq_u64 (uint64x2_t __a, int64x2_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_p16 (poly16_t * __a, poly16x4x2_t val)
{
-- return (uint64x2_t) __builtin_aarch64_ushll2_nv4si ((int32x4_t) __a, __b);
+- return __builtin_aarch64_ushlv2di_uus (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ poly16x8x2_t temp;
+ temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
@@ -42075,13 +43879,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
}
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vshll_n_s8 (int8x8_t __a, const int __b)
+-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+-vshld_s64 (int64_t __a, int64_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_s32 (int32_t * __a, int32x2x2_t val)
{
-- return __builtin_aarch64_sshll_nv8qi (__a, __b);
+- return __builtin_aarch64_sshldi (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ int32x4x2_t temp;
+ temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
@@ -42091,13 +43895,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o);
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vshll_n_s16 (int16x4_t __a, const int __b)
+-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+-vshld_u64 (uint64_t __a, uint64_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_u8 (uint8_t * __a, uint8x8x2_t val)
{
-- return __builtin_aarch64_sshll_nv4hi (__a, __b);
+- return __builtin_aarch64_ushldi_uus (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ uint8x16x2_t temp;
+ temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
@@ -42107,13 +43911,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vshll_n_s32 (int32x2_t __a, const int __b)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vshll_high_n_s8 (int8x16_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_u16 (uint16_t * __a, uint16x4x2_t val)
{
-- return __builtin_aarch64_sshll_nv2si (__a, __b);
+- return __builtin_aarch64_sshll2_nv16qi (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ uint16x8x2_t temp;
+ temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
@@ -42123,13 +43927,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
}
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vshll_n_u8 (uint8x8_t __a, const int __b)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vshll_high_n_s16 (int16x8_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_u32 (uint32_t * __a, uint32x2x2_t val)
{
-- return __builtin_aarch64_ushll_nv8qi_uus (__a, __b);
+- return __builtin_aarch64_sshll2_nv8hi (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ uint32x4x2_t temp;
+ temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
@@ -42139,13 +43943,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o);
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vshll_n_u16 (uint16x4_t __a, const int __b)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vshll_high_n_s32 (int32x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_f16 (float16_t * __a, float16x4x2_t val)
{
-- return __builtin_aarch64_ushll_nv4hi_uus (__a, __b);
+- return __builtin_aarch64_sshll2_nv4si (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ float16x8x2_t temp;
+ temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
@@ -42155,13 +43959,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st2v4hf (__a, __o);
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vshll_n_u32 (uint32x2_t __a, const int __b)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vshll_high_n_u8 (uint8x16_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_f32 (float32_t * __a, float32x2x2_t val)
{
-- return __builtin_aarch64_ushll_nv2si_uus (__a, __b);
+- return (uint16x8_t) __builtin_aarch64_ushll2_nv16qi ((int8x16_t) __a, __b);
+ __builtin_aarch64_simd_oi __o;
+ float32x4x2_t temp;
+ temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
@@ -42171,184 +43975,217 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st2v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
}
--/* vshr */
--
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vshr_n_s8 (int8x8_t __a, const int __b)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vshll_high_n_u16 (uint16x8_t __a, const int __b)
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vst2_p64 (poly64_t * __a, poly64x1x2_t val)
+ {
+- return (uint32x4_t) __builtin_aarch64_ushll2_nv8hi ((int16x8_t) __a, __b);
++ __builtin_aarch64_simd_oi __o;
++ poly64x2x2_t temp;
++ temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
++ temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
++ __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
++ (poly64x2_t) temp.val[0], 0);
++ __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
++ (poly64x2_t) temp.val[1], 1);
++ __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vshll_high_n_u32 (uint32x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_s8 (int8_t * __a, int8x16x2_t val)
{
-- return (int8x8_t) __builtin_aarch64_ashrv8qi (__a, __b);
+- return (uint64x2_t) __builtin_aarch64_ushll2_nv4si ((int32x4_t) __a, __b);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+ __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
}
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vshr_n_s16 (int16x4_t __a, const int __b)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vshll_n_s8 (int8x8_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_p8 (poly8_t * __a, poly8x16x2_t val)
{
-- return (int16x4_t) __builtin_aarch64_ashrv4hi (__a, __b);
+- return __builtin_aarch64_sshll_nv8qi (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+ __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vshr_n_s32 (int32x2_t __a, const int __b)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vshll_n_s16 (int16x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_s16 (int16_t * __a, int16x8x2_t val)
{
-- return (int32x2_t) __builtin_aarch64_ashrv2si (__a, __b);
+- return __builtin_aarch64_sshll_nv4hi (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+ __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
}
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vshr_n_s64 (int64x1_t __a, const int __b)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vshll_n_s32 (int32x2_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_p16 (poly16_t * __a, poly16x8x2_t val)
{
-- return (int64x1_t) {__builtin_aarch64_ashr_simddi (__a[0], __b)};
+- return __builtin_aarch64_sshll_nv2si (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+ __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
}
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vshr_n_u8 (uint8x8_t __a, const int __b)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vshll_n_u8 (uint8x8_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_s32 (int32_t * __a, int32x4x2_t val)
{
-- return (uint8x8_t) __builtin_aarch64_lshrv8qi ((int8x8_t) __a, __b);
+- return __builtin_aarch64_ushll_nv8qi_uus (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
+ __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o);
}
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vshr_n_u16 (uint16x4_t __a, const int __b)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vshll_n_u16 (uint16x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_s64 (int64_t * __a, int64x2x2_t val)
{
-- return (uint16x4_t) __builtin_aarch64_lshrv4hi ((int16x4_t) __a, __b);
+- return __builtin_aarch64_ushll_nv4hi_uus (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
+ __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vshr_n_u32 (uint32x2_t __a, const int __b)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vshll_n_u32 (uint32x2_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_u8 (uint8_t * __a, uint8x16x2_t val)
{
-- return (uint32x2_t) __builtin_aarch64_lshrv2si ((int32x2_t) __a, __b);
+- return __builtin_aarch64_ushll_nv2si_uus (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+ __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
}
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vshr_n_u64 (uint64x1_t __a, const int __b)
+-/* vshr */
+-
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vshr_n_s8 (int8x8_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_u16 (uint16_t * __a, uint16x8x2_t val)
{
-- return (uint64x1_t) {__builtin_aarch64_lshr_simddi_uus ( __a[0], __b)};
+- return (int8x8_t) __builtin_aarch64_ashrv8qi (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+ __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
}
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vshrq_n_s8 (int8x16_t __a, const int __b)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vshr_n_s16 (int16x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_u32 (uint32_t * __a, uint32x4x2_t val)
{
-- return (int8x16_t) __builtin_aarch64_ashrv16qi (__a, __b);
+- return (int16x4_t) __builtin_aarch64_ashrv4hi (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
+ __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o);
}
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vshrq_n_s16 (int16x8_t __a, const int __b)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vshr_n_s32 (int32x2_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_u64 (uint64_t * __a, uint64x2x2_t val)
{
-- return (int16x8_t) __builtin_aarch64_ashrv8hi (__a, __b);
+- return (int32x2_t) __builtin_aarch64_ashrv2si (__a, __b);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
+ __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vshrq_n_s32 (int32x4_t __a, const int __b)
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+-vshr_n_s64 (int64x1_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_f16 (float16_t * __a, float16x8x2_t val)
{
-- return (int32x4_t) __builtin_aarch64_ashrv4si (__a, __b);
+- return (int64x1_t) {__builtin_aarch64_ashr_simddi (__a[0], __b)};
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1);
+ __builtin_aarch64_st2v8hf (__a, __o);
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vshrq_n_s64 (int64x2_t __a, const int __b)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vshr_n_u8 (uint8x8_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_f32 (float32_t * __a, float32x4x2_t val)
{
-- return (int64x2_t) __builtin_aarch64_ashrv2di (__a, __b);
+- return (uint8x8_t) __builtin_aarch64_lshrv8qi ((int8x8_t) __a, __b);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1);
+ __builtin_aarch64_st2v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
}
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vshrq_n_u8 (uint8x16_t __a, const int __b)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vshr_n_u16 (uint16x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_f64 (float64_t * __a, float64x2x2_t val)
{
-- return (uint8x16_t) __builtin_aarch64_lshrv16qi ((int8x16_t) __a, __b);
+- return (uint16x4_t) __builtin_aarch64_lshrv4hi ((int16x4_t) __a, __b);
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1);
+ __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) __a, __o);
}
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vshrq_n_u16 (uint16x8_t __a, const int __b)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vshr_n_u32 (uint32x2_t __a, const int __b)
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vst2q_p64 (poly64_t * __a, poly64x2x2_t val)
+ {
+- return (uint32x2_t) __builtin_aarch64_lshrv2si ((int32x2_t) __a, __b);
++ __builtin_aarch64_simd_oi __o;
++ __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
++ (poly64x2_t) val.val[0], 0);
++ __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
++ (poly64x2_t) val.val[1], 1);
++ __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+-vshr_n_u64 (uint64x1_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_s64 (int64_t * __a, int64x1x3_t val)
{
-- return (uint16x8_t) __builtin_aarch64_lshrv8hi ((int16x8_t) __a, __b);
+- return (uint64x1_t) {__builtin_aarch64_lshr_simddi_uus ( __a[0], __b)};
+ __builtin_aarch64_simd_ci __o;
+ int64x2x3_t temp;
+ temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
@@ -42360,13 +44197,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o);
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vshrq_n_u32 (uint32x4_t __a, const int __b)
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+-vshrq_n_s8 (int8x16_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_u64 (uint64_t * __a, uint64x1x3_t val)
{
-- return (uint32x4_t) __builtin_aarch64_lshrv4si ((int32x4_t) __a, __b);
+- return (int8x16_t) __builtin_aarch64_ashrv16qi (__a, __b);
+ __builtin_aarch64_simd_ci __o;
+ uint64x2x3_t temp;
+ temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
@@ -42378,13 +44215,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o);
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vshrq_n_u64 (uint64x2_t __a, const int __b)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vshrq_n_s16 (int16x8_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_f64 (float64_t * __a, float64x1x3_t val)
{
-- return (uint64x2_t) __builtin_aarch64_lshrv2di ((int64x2_t) __a, __b);
+- return (int16x8_t) __builtin_aarch64_ashrv8hi (__a, __b);
+ __builtin_aarch64_simd_ci __o;
+ float64x2x3_t temp;
+ temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
@@ -42396,13 +44233,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3df ((__builtin_aarch64_simd_df *) __a, __o);
}
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vshrd_n_s64 (int64_t __a, const int __b)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vshrq_n_s32 (int32x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_s8 (int8_t * __a, int8x8x3_t val)
{
-- return __builtin_aarch64_ashr_simddi (__a, __b);
+- return (int32x4_t) __builtin_aarch64_ashrv4si (__a, __b);
+ __builtin_aarch64_simd_ci __o;
+ int8x16x3_t temp;
+ temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
@@ -42414,13 +44251,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
}
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vshrd_n_u64 (uint64_t __a, const int __b)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vshrq_n_s64 (int64x2_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_p8 (poly8_t * __a, poly8x8x3_t val)
{
-- return __builtin_aarch64_lshr_simddi_uus (__a, __b);
+- return (int64x2_t) __builtin_aarch64_ashrv2di (__a, __b);
+ __builtin_aarch64_simd_ci __o;
+ poly8x16x3_t temp;
+ temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
@@ -42432,15 +44269,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
}
--/* vsli */
--
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vsli_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vshrq_n_u8 (uint8x16_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_s16 (int16_t * __a, int16x4x3_t val)
{
-- return (int8x8_t) __builtin_aarch64_ssli_nv8qi (__a, __b, __c);
+- return (uint8x16_t) __builtin_aarch64_lshrv16qi ((int8x16_t) __a, __b);
+ __builtin_aarch64_simd_ci __o;
+ int16x8x3_t temp;
+ temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
@@ -42452,13 +44287,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
}
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vsli_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vshrq_n_u16 (uint16x8_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_p16 (poly16_t * __a, poly16x4x3_t val)
{
-- return (int16x4_t) __builtin_aarch64_ssli_nv4hi (__a, __b, __c);
+- return (uint16x8_t) __builtin_aarch64_lshrv8hi ((int16x8_t) __a, __b);
+ __builtin_aarch64_simd_ci __o;
+ poly16x8x3_t temp;
+ temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
@@ -42470,13 +44305,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vsli_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vshrq_n_u32 (uint32x4_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_s32 (int32_t * __a, int32x2x3_t val)
{
-- return (int32x2_t) __builtin_aarch64_ssli_nv2si (__a, __b, __c);
+- return (uint32x4_t) __builtin_aarch64_lshrv4si ((int32x4_t) __a, __b);
+ __builtin_aarch64_simd_ci __o;
+ int32x4x3_t temp;
+ temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
@@ -42488,13 +44323,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o);
}
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vsli_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vshrq_n_u64 (uint64x2_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_u8 (uint8_t * __a, uint8x8x3_t val)
{
-- return (int64x1_t) {__builtin_aarch64_ssli_ndi (__a[0], __b[0], __c)};
+- return (uint64x2_t) __builtin_aarch64_lshrv2di ((int64x2_t) __a, __b);
+ __builtin_aarch64_simd_ci __o;
+ uint8x16x3_t temp;
+ temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
@@ -42506,13 +44341,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
}
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vsli_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
+-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+-vshrd_n_s64 (int64_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_u16 (uint16_t * __a, uint16x4x3_t val)
{
-- return __builtin_aarch64_usli_nv8qi_uuus (__a, __b, __c);
+- return __builtin_aarch64_ashr_simddi (__a, __b);
+ __builtin_aarch64_simd_ci __o;
+ uint16x8x3_t temp;
+ temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
@@ -42524,13 +44359,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
}
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vsli_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+-vshrd_n_u64 (uint64_t __a, const int __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_u32 (uint32_t * __a, uint32x2x3_t val)
{
-- return __builtin_aarch64_usli_nv4hi_uuus (__a, __b, __c);
+- return __builtin_aarch64_lshr_simddi_uus (__a, __b);
+ __builtin_aarch64_simd_ci __o;
+ uint32x4x3_t temp;
+ temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
@@ -42542,13 +44377,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o);
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vsli_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+-/* vsli */
+-
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vsli_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_f16 (float16_t * __a, float16x4x3_t val)
{
-- return __builtin_aarch64_usli_nv2si_uuus (__a, __b, __c);
+- return (int8x8_t) __builtin_aarch64_ssli_nv8qi (__a, __b, __c);
+ __builtin_aarch64_simd_ci __o;
+ float16x8x3_t temp;
+ temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
@@ -42560,13 +44397,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
}
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vsli_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vsli_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_f32 (float32_t * __a, float32x2x3_t val)
{
-- return (uint64x1_t) {__builtin_aarch64_usli_ndi_uuus (__a[0], __b[0], __c)};
+- return (int16x4_t) __builtin_aarch64_ssli_nv4hi (__a, __b, __c);
+ __builtin_aarch64_simd_ci __o;
+ float32x4x3_t temp;
+ temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
@@ -42578,13 +44415,34 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
}
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vsliq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vsli_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vst3_p64 (poly64_t * __a, poly64x1x3_t val)
+ {
+- return (int32x2_t) __builtin_aarch64_ssli_nv2si (__a, __b, __c);
++ __builtin_aarch64_simd_ci __o;
++ poly64x2x3_t temp;
++ temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
++ temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
++ temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0)));
++ __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
++ (poly64x2_t) temp.val[0], 0);
++ __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
++ (poly64x2_t) temp.val[1], 1);
++ __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
++ (poly64x2_t) temp.val[2], 2);
++ __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+-vsli_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_s8 (int8_t * __a, int8x16x3_t val)
{
-- return (int8x16_t) __builtin_aarch64_ssli_nv16qi (__a, __b, __c);
+- return (int64x1_t) {__builtin_aarch64_ssli_ndi (__a[0], __b[0], __c)};
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
@@ -42592,13 +44450,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
}
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vsliq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vsli_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_p8 (poly8_t * __a, poly8x16x3_t val)
{
-- return (int16x8_t) __builtin_aarch64_ssli_nv8hi (__a, __b, __c);
+- return __builtin_aarch64_usli_nv8qi_uuus (__a, __b, __c);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
@@ -42606,13 +44464,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vsliq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vsli_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_s16 (int16_t * __a, int16x8x3_t val)
{
-- return (int32x4_t) __builtin_aarch64_ssli_nv4si (__a, __b, __c);
+- return __builtin_aarch64_usli_nv4hi_uuus (__a, __b, __c);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
@@ -42620,13 +44478,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vsliq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vsli_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_p16 (poly16_t * __a, poly16x8x3_t val)
{
-- return (int64x2_t) __builtin_aarch64_ssli_nv2di (__a, __b, __c);
+- return __builtin_aarch64_usli_nv2si_uuus (__a, __b, __c);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
@@ -42634,13 +44492,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
}
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vsliq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+-vsli_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_s32 (int32_t * __a, int32x4x3_t val)
{
-- return __builtin_aarch64_usli_nv16qi_uuus (__a, __b, __c);
+- return (uint64x1_t) {__builtin_aarch64_usli_ndi_uuus (__a[0], __b[0], __c)};
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
@@ -42648,13 +44506,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o);
}
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vsliq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+-vsliq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_s64 (int64_t * __a, int64x2x3_t val)
{
-- return __builtin_aarch64_usli_nv8hi_uuus (__a, __b, __c);
+- return (int8x16_t) __builtin_aarch64_ssli_nv16qi (__a, __b, __c);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
@@ -42662,13 +44520,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o);
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsliq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vsliq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_u8 (uint8_t * __a, uint8x16x3_t val)
{
-- return __builtin_aarch64_usli_nv4si_uuus (__a, __b, __c);
+- return (int16x8_t) __builtin_aarch64_ssli_nv8hi (__a, __b, __c);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
@@ -42676,13 +44534,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vsliq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vsliq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_u16 (uint16_t * __a, uint16x8x3_t val)
{
-- return __builtin_aarch64_usli_nv2di_uuus (__a, __b, __c);
+- return (int32x4_t) __builtin_aarch64_ssli_nv4si (__a, __b, __c);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
@@ -42690,13 +44548,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
}
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vslid_n_s64 (int64_t __a, int64_t __b, const int __c)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vsliq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_u32 (uint32_t * __a, uint32x4x3_t val)
{
-- return __builtin_aarch64_ssli_ndi (__a, __b, __c);
+- return (int64x2_t) __builtin_aarch64_ssli_nv2di (__a, __b, __c);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
@@ -42704,13 +44562,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o);
}
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vslid_n_u64 (uint64_t __a, uint64_t __b, const int __c)
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vsliq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_u64 (uint64_t * __a, uint64x2x3_t val)
{
-- return __builtin_aarch64_usli_ndi_uuus (__a, __b, __c);
+- return __builtin_aarch64_usli_nv16qi_uuus (__a, __b, __c);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
@@ -42718,15 +44576,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o);
}
--/* vsqadd */
--
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vsqadd_u8 (uint8x8_t __a, int8x8_t __b)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vsliq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_f16 (float16_t * __a, float16x8x3_t val)
{
-- return __builtin_aarch64_usqaddv8qi_uus (__a, __b);
+- return __builtin_aarch64_usli_nv8hi_uuus (__a, __b, __c);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1);
@@ -42734,13 +44590,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
}
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vsqadd_u16 (uint16x4_t __a, int16x4_t __b)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vsliq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_f32 (float32_t * __a, float32x4x3_t val)
{
-- return __builtin_aarch64_usqaddv4hi_uus (__a, __b);
+- return __builtin_aarch64_usli_nv4si_uuus (__a, __b, __c);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1);
@@ -42748,13 +44604,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vsqadd_u32 (uint32x2_t __a, int32x2_t __b)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vsliq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_f64 (float64_t * __a, float64x2x3_t val)
{
-- return __builtin_aarch64_usqaddv2si_uus (__a, __b);
+- return __builtin_aarch64_usli_nv2di_uuus (__a, __b, __c);
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1);
@@ -42762,13 +44618,30 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st3v2df ((__builtin_aarch64_simd_df *) __a, __o);
}
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vsqadd_u64 (uint64x1_t __a, int64x1_t __b)
+-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+-vslid_n_s64 (int64_t __a, int64_t __b, const int __c)
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vst3q_p64 (poly64_t * __a, poly64x2x3_t val)
+ {
+- return __builtin_aarch64_ssli_ndi (__a, __b, __c);
++ __builtin_aarch64_simd_ci __o;
++ __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
++ (poly64x2_t) val.val[0], 0);
++ __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
++ (poly64x2_t) val.val[1], 1);
++ __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
++ (poly64x2_t) val.val[2], 2);
++ __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+ }
+
+-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+-vslid_n_u64 (uint64_t __a, uint64_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4_s64 (int64_t * __a, int64x1x4_t val)
{
-- return (uint64x1_t) {__builtin_aarch64_usqadddi_uus (__a[0], __b[0])};
+- return __builtin_aarch64_usli_ndi_uuus (__a, __b, __c);
+ __builtin_aarch64_simd_xi __o;
+ int64x2x4_t temp;
+ temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
@@ -42782,13 +44655,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o);
}
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vsqaddq_u8 (uint8x16_t __a, int8x16_t __b)
+-/* vsqadd */
+-
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vsqadd_u8 (uint8x8_t __a, int8x8_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4_u64 (uint64_t * __a, uint64x1x4_t val)
{
-- return __builtin_aarch64_usqaddv16qi_uus (__a, __b);
+- return __builtin_aarch64_usqaddv8qi_uus (__a, __b);
+ __builtin_aarch64_simd_xi __o;
+ uint64x2x4_t temp;
+ temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
@@ -42802,13 +44677,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o);
}
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vsqaddq_u16 (uint16x8_t __a, int16x8_t __b)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vsqadd_u16 (uint16x4_t __a, int16x4_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4_f64 (float64_t * __a, float64x1x4_t val)
{
-- return __builtin_aarch64_usqaddv8hi_uus (__a, __b);
+- return __builtin_aarch64_usqaddv4hi_uus (__a, __b);
+ __builtin_aarch64_simd_xi __o;
+ float64x2x4_t temp;
+ temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
@@ -42822,13 +44697,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4df ((__builtin_aarch64_simd_df *) __a, __o);
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsqaddq_u32 (uint32x4_t __a, int32x4_t __b)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vsqadd_u32 (uint32x2_t __a, int32x2_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4_s8 (int8_t * __a, int8x8x4_t val)
{
-- return __builtin_aarch64_usqaddv4si_uus (__a, __b);
+- return __builtin_aarch64_usqaddv2si_uus (__a, __b);
+ __builtin_aarch64_simd_xi __o;
+ int8x16x4_t temp;
+ temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
@@ -42842,13 +44717,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
}
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vsqaddq_u64 (uint64x2_t __a, int64x2_t __b)
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+-vsqadd_u64 (uint64x1_t __a, int64x1_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4_p8 (poly8_t * __a, poly8x8x4_t val)
{
-- return __builtin_aarch64_usqaddv2di_uus (__a, __b);
+- return (uint64x1_t) {__builtin_aarch64_usqadddi_uus (__a[0], __b[0])};
+ __builtin_aarch64_simd_xi __o;
+ poly8x16x4_t temp;
+ temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
@@ -42862,13 +44737,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
}
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vsqaddb_u8 (uint8_t __a, int8_t __b)
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vsqaddq_u8 (uint8x16_t __a, int8x16_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4_s16 (int16_t * __a, int16x4x4_t val)
{
-- return __builtin_aarch64_usqaddqi_uus (__a, __b);
+- return __builtin_aarch64_usqaddv16qi_uus (__a, __b);
+ __builtin_aarch64_simd_xi __o;
+ int16x8x4_t temp;
+ temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
@@ -42882,13 +44757,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
}
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vsqaddh_u16 (uint16_t __a, int16_t __b)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vsqaddq_u16 (uint16x8_t __a, int16x8_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4_p16 (poly16_t * __a, poly16x4x4_t val)
{
-- return __builtin_aarch64_usqaddhi_uus (__a, __b);
+- return __builtin_aarch64_usqaddv8hi_uus (__a, __b);
+ __builtin_aarch64_simd_xi __o;
+ poly16x8x4_t temp;
+ temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
@@ -42902,13 +44777,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
}
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vsqadds_u32 (uint32_t __a, int32_t __b)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vsqaddq_u32 (uint32x4_t __a, int32x4_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4_s32 (int32_t * __a, int32x2x4_t val)
{
-- return __builtin_aarch64_usqaddsi_uus (__a, __b);
+- return __builtin_aarch64_usqaddv4si_uus (__a, __b);
+ __builtin_aarch64_simd_xi __o;
+ int32x4x4_t temp;
+ temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
@@ -42922,13 +44797,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o);
}
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vsqaddd_u64 (uint64_t __a, int64_t __b)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vsqaddq_u64 (uint64x2_t __a, int64x2_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4_u8 (uint8_t * __a, uint8x8x4_t val)
{
-- return __builtin_aarch64_usqadddi_uus (__a, __b);
+- return __builtin_aarch64_usqaddv2di_uus (__a, __b);
+ __builtin_aarch64_simd_xi __o;
+ uint8x16x4_t temp;
+ temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
@@ -42942,14 +44817,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
}
--/* vsqrt */
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vsqrt_f32 (float32x2_t a)
+-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
+-vsqaddb_u8 (uint8_t __a, int8_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4_u16 (uint16_t * __a, uint16x4x4_t val)
{
-- return __builtin_aarch64_sqrtv2sf (a);
+- return __builtin_aarch64_usqaddqi_uus (__a, __b);
+ __builtin_aarch64_simd_xi __o;
+ uint16x8x4_t temp;
+ temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
@@ -42963,13 +44837,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
}
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vsqrtq_f32 (float32x4_t a)
+-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
+-vsqaddh_u16 (uint16_t __a, int16_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4_u32 (uint32_t * __a, uint32x2x4_t val)
{
-- return __builtin_aarch64_sqrtv4sf (a);
+- return __builtin_aarch64_usqaddhi_uus (__a, __b);
+ __builtin_aarch64_simd_xi __o;
+ uint32x4x4_t temp;
+ temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
@@ -42983,13 +44857,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o);
}
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vsqrt_f64 (float64x1_t a)
+-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+-vsqadds_u32 (uint32_t __a, int32_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4_f16 (float16_t * __a, float16x4x4_t val)
{
-- return (float64x1_t) { __builtin_aarch64_sqrtdf (a[0]) };
+- return __builtin_aarch64_usqaddsi_uus (__a, __b);
+ __builtin_aarch64_simd_xi __o;
+ float16x8x4_t temp;
+ temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
@@ -43003,13 +44877,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
}
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vsqrtq_f64 (float64x2_t a)
+-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+-vsqaddd_u64 (uint64_t __a, int64_t __b)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4_f32 (float32_t * __a, float32x2x4_t val)
{
-- return __builtin_aarch64_sqrtv2df (a);
+- return __builtin_aarch64_usqadddi_uus (__a, __b);
+ __builtin_aarch64_simd_xi __o;
+ float32x4x4_t temp;
+ temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
@@ -43023,15 +44897,38 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
}
--/* vsra */
--
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
+-/* vsqrt */
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vsqrt_f32 (float32x2_t a)
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vst4_p64 (poly64_t * __a, poly64x1x4_t val)
+ {
+- return __builtin_aarch64_sqrtv2sf (a);
++ __builtin_aarch64_simd_xi __o;
++ poly64x2x4_t temp;
++ temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
++ temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
++ temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0)));
++ temp.val[3] = vcombine_p64 (val.val[3], vcreate_p64 (__AARCH64_UINT64_C (0)));
++ __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
++ (poly64x2_t) temp.val[0], 0);
++ __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
++ (poly64x2_t) temp.val[1], 1);
++ __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
++ (poly64x2_t) temp.val[2], 2);
++ __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
++ (poly64x2_t) temp.val[3], 3);
++ __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vsqrtq_f32 (float32x4_t a)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4q_s8 (int8_t * __a, int8x16x4_t val)
{
-- return (int8x8_t) __builtin_aarch64_ssra_nv8qi (__a, __b, __c);
+- return __builtin_aarch64_sqrtv4sf (a);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
@@ -43040,13 +44937,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
}
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+-vsqrt_f64 (float64x1_t a)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4q_p8 (poly8_t * __a, poly8x16x4_t val)
{
-- return (int16x4_t) __builtin_aarch64_ssra_nv4hi (__a, __b, __c);
+- return (float64x1_t) { __builtin_aarch64_sqrtdf (a[0]) };
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
@@ -43055,13 +44952,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
}
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vsqrtq_f64 (float64x2_t a)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4q_s16 (int16_t * __a, int16x8x4_t val)
{
-- return (int32x2_t) __builtin_aarch64_ssra_nv2si (__a, __b, __c);
+- return __builtin_aarch64_sqrtv2df (a);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
@@ -43070,13 +44967,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
}
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
+-/* vsra */
+-
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+-vsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4q_p16 (poly16_t * __a, poly16x8x4_t val)
{
-- return (int64x1_t) {__builtin_aarch64_ssra_ndi (__a[0], __b[0], __c)};
+- return (int8x8_t) __builtin_aarch64_ssra_nv8qi (__a, __b, __c);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
@@ -43085,13 +44984,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
}
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4q_s32 (int32_t * __a, int32x4x4_t val)
{
-- return __builtin_aarch64_usra_nv8qi_uuus (__a, __b, __c);
+- return (int16x4_t) __builtin_aarch64_ssra_nv4hi (__a, __b, __c);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1);
@@ -43100,13 +44999,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o);
}
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4q_s64 (int64_t * __a, int64x2x4_t val)
{
-- return __builtin_aarch64_usra_nv4hi_uuus (__a, __b, __c);
+- return (int32x2_t) __builtin_aarch64_ssra_nv2si (__a, __b, __c);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1);
@@ -43115,13 +45014,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
}
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+-vsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4q_u8 (uint8_t * __a, uint8x16x4_t val)
{
-- return __builtin_aarch64_usra_nv2si_uuus (__a, __b, __c);
+- return (int64x1_t) {__builtin_aarch64_ssra_ndi (__a[0], __b[0], __c)};
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
@@ -43130,13 +45029,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
}
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4q_u16 (uint16_t * __a, uint16x8x4_t val)
{
-- return (uint64x1_t) {__builtin_aarch64_usra_ndi_uuus (__a[0], __b[0], __c)};
+- return __builtin_aarch64_usra_nv8qi_uuus (__a, __b, __c);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
@@ -43145,13 +45044,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
}
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4q_u32 (uint32_t * __a, uint32x4x4_t val)
{
-- return (int8x16_t) __builtin_aarch64_ssra_nv16qi (__a, __b, __c);
+- return __builtin_aarch64_usra_nv4hi_uuus (__a, __b, __c);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1);
@@ -43160,13 +45059,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o);
}
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4q_u64 (uint64_t * __a, uint64x2x4_t val)
{
-- return (int16x8_t) __builtin_aarch64_ssra_nv8hi (__a, __b, __c);
+- return __builtin_aarch64_usra_nv2si_uuus (__a, __b, __c);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1);
@@ -43175,13 +45074,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
}
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+-vsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4q_f16 (float16_t * __a, float16x8x4_t val)
{
-- return (int32x4_t) __builtin_aarch64_ssra_nv4si (__a, __b, __c);
+- return (uint64x1_t) {__builtin_aarch64_usra_ndi_uuus (__a[0], __b[0], __c)};
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[1], 1);
@@ -43190,13 +45089,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
}
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+-vsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4q_f32 (float32_t * __a, float32x4x4_t val)
{
-- return (int64x2_t) __builtin_aarch64_ssra_nv2di (__a, __b, __c);
+- return (int8x16_t) __builtin_aarch64_ssra_nv16qi (__a, __b, __c);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[1], 1);
@@ -43205,13 +45104,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
}
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4q_f64 (float64_t * __a, float64x2x4_t val)
{
-- return __builtin_aarch64_usra_nv16qi_uuus (__a, __b, __c);
+- return (int16x8_t) __builtin_aarch64_ssra_nv8hi (__a, __b, __c);
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[1], 1);
@@ -43220,32 +45119,61 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ __builtin_aarch64_st4v2df ((__builtin_aarch64_simd_df *) __a, __o);
}
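
The vst4q_* hunks above all follow one pattern: pack the four Q registers into
an opaque __builtin_aarch64_simd_xi value, then issue a single st4 builtin.
A minimal usage sketch -- illustrative only, not part of the patch -- of what
the rewritten intrinsic gives a caller:

  /* Store four uint8x16_t vectors with 4-way interleaving:
     out[0]=r[0], out[1]=g[0], out[2]=b[0], out[3]=a[0], out[4]=r[1], ...
     out must have room for 64 bytes.  */
  #include <arm_neon.h>

  void
  store_rgba (uint8_t *out, uint8x16_t r, uint8x16_t g,
              uint8x16_t b, uint8x16_t a)
  {
    uint8x16x4_t v = { { r, g, b, a } };
    vst4q_u8 (out, v);
  }
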
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vst4q_p64 (poly64_t * __a, poly64x2x4_t val)
+ {
+- return (int32x4_t) __builtin_aarch64_ssra_nv4si (__a, __b, __c);
++ __builtin_aarch64_simd_xi __o;
++ __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
++ (poly64x2_t) val.val[0], 0);
++ __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
++ (poly64x2_t) val.val[1], 1);
++ __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
++ (poly64x2_t) val.val[2], 2);
++ __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
++ (poly64x2_t) val.val[3], 3);
++ __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+-vsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+-{
+- return (int64x2_t) __builtin_aarch64_ssra_nv2di (__a, __b, __c);
+-}
+/* vsub */
-+
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+__extension__ extern __inline int64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsubd_s64 (int64_t __a, int64_t __b)
{
-- return __builtin_aarch64_usra_nv8hi_uuus (__a, __b, __c);
+- return __builtin_aarch64_usra_nv16qi_uuus (__a, __b, __c);
+ return __a - __b;
}
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+__extension__ extern __inline uint64_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsubd_u64 (uint64_t __a, uint64_t __b)
{
-- return __builtin_aarch64_usra_nv4si_uuus (__a, __b, __c);
+- return __builtin_aarch64_usra_nv8hi_uuus (__a, __b, __c);
+ return __a - __b;
}
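
Every intrinsic in these arm_neon.h hunks is converted from
"static __inline ... __always_inline__" to "extern __inline" with
__gnu_inline__ and __artificial__.  A minimal sketch of the declaration
pattern and why it is safe in a header -- illustrative only:

  /* With __gnu_inline__, an 'extern __inline' function behaves like a
     GNU89 inline: it is inlined at every call site and no out-of-line
     definition is ever emitted, so the header cannot introduce
     duplicate symbols across translation units.  __artificial__ keeps
     debuggers from stepping into the one-line body.  */
  __extension__ extern __inline int
  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
  add_one (int x)
  {
    return x + 1;
  }
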
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+-{
+- return __builtin_aarch64_usra_nv4si_uuus (__a, __b, __c);
+-}
++/* vtbx1 */
+
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
-+/* vtbx1 */
-+
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vtbx1_s8 (int8x8_t __r, int8x8_t __tab, int8x8_t __idx)
@@ -46483,7 +48411,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
{
#ifdef __AARCH64EB__
return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
-@@ -24455,1319 +29209,1184 @@ vtrn2q_f64 (float64x2_t __a, float64x2_t __b)
+@@ -24455,1319 +30368,1184 @@ vtrn2q_f64 (float64x2_t __a, float64x2_t __b)
#endif
}
@@ -48544,7 +50472,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
#undef __aarch64_vdup_lane_f32
#undef __aarch64_vdup_lane_f64
#undef __aarch64_vdup_lane_p8
-@@ -25780,6 +30399,7 @@ __INTERLEAVE_LIST (zip)
+@@ -25780,6 +31558,7 @@ __INTERLEAVE_LIST (zip)
#undef __aarch64_vdup_lane_u16
#undef __aarch64_vdup_lane_u32
#undef __aarch64_vdup_lane_u64
@@ -48552,7 +50480,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
#undef __aarch64_vdup_laneq_f32
#undef __aarch64_vdup_laneq_f64
#undef __aarch64_vdup_laneq_p8
-@@ -25792,6 +30412,7 @@ __INTERLEAVE_LIST (zip)
+@@ -25792,6 +31571,7 @@ __INTERLEAVE_LIST (zip)
#undef __aarch64_vdup_laneq_u16
#undef __aarch64_vdup_laneq_u32
#undef __aarch64_vdup_laneq_u64
@@ -48560,7 +50488,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
#undef __aarch64_vdupq_lane_f32
#undef __aarch64_vdupq_lane_f64
#undef __aarch64_vdupq_lane_p8
-@@ -25804,6 +30425,7 @@ __INTERLEAVE_LIST (zip)
+@@ -25804,6 +31584,7 @@ __INTERLEAVE_LIST (zip)
#undef __aarch64_vdupq_lane_u16
#undef __aarch64_vdupq_lane_u32
#undef __aarch64_vdupq_lane_u64
@@ -48568,7 +50496,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
#undef __aarch64_vdupq_laneq_f32
#undef __aarch64_vdupq_laneq_f64
#undef __aarch64_vdupq_laneq_p8
-@@ -25817,6 +30439,4 @@ __INTERLEAVE_LIST (zip)
+@@ -25817,6 +31598,4 @@ __INTERLEAVE_LIST (zip)
#undef __aarch64_vdupq_laneq_u32
#undef __aarch64_vdupq_laneq_u64
@@ -48939,6 +50867,51 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_int_attr perm_insn [(UNSPEC_ZIP1 "zip") (UNSPEC_ZIP2 "zip")
(UNSPEC_TRN1 "trn") (UNSPEC_TRN2 "trn")
(UNSPEC_UZP1 "uzp") (UNSPEC_UZP2 "uzp")])
+--- a/src/gcc/config/aarch64/t-aarch64
++++ b/src/gcc/config/aarch64/t-aarch64
+@@ -52,7 +52,7 @@ aarch-common.o: $(srcdir)/config/arm/aarch-common.c $(CONFIG_H) $(SYSTEM_H) \
+ $(srcdir)/config/arm/aarch-common.c
+
+ aarch64-c.o: $(srcdir)/config/aarch64/aarch64-c.c $(CONFIG_H) $(SYSTEM_H) \
+- coretypes.h $(TM_H) $(TREE_H) output.h $(C_COMMON_H)
++ coretypes.h $(TM_H) $(TREE_H) output.h $(C_COMMON_H) $(TARGET_H)
+ $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+ $(srcdir)/config/aarch64/aarch64-c.c
+
+--- /dev/null
++++ b/src/gcc/config/aarch64/t-aarch64-freebsd
+@@ -0,0 +1,21 @@
++# Machine description for AArch64 architecture.
++# Copyright (C) 2016 Free Software Foundation, Inc.
++#
++# This file is part of GCC.
++#
++# GCC is free software; you can redistribute it and/or modify it
++# under the terms of the GNU General Public License as published by
++# the Free Software Foundation; either version 3, or (at your option)
++# any later version.
++#
++# GCC is distributed in the hope that it will be useful, but
++# WITHOUT ANY WARRANTY; without even the implied warranty of
++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++# General Public License for more details.
++#
++# You should have received a copy of the GNU General Public License
++# along with GCC; see the file COPYING3. If not see
++# <http://www.gnu.org/licenses/>.
++
++LIB1ASMSRC = aarch64/lib1funcs.asm
++LIB1ASMFUNCS = _aarch64_sync_cache_range
+--- a/src/gcc/config/alpha/alpha.c
++++ b/src/gcc/config/alpha/alpha.c
+@@ -26,6 +26,7 @@ along with GCC; see the file COPYING3. If not see
+ #include "target.h"
+ #include "rtl.h"
+ #include "tree.h"
++#include "memmodel.h"
+ #include "gimple.h"
+ #include "df.h"
+ #include "tm_p.h"
--- a/src/gcc/config/arm/aarch-cost-tables.h
+++ b/src/gcc/config/arm/aarch-cost-tables.h
@@ -191,35 +191,35 @@ const struct cpu_cost_table cortexa53_extra_costs =
@@ -49191,12 +51164,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ARM_ARCH ("armv8.2-a+fp16", cortexa53, 8A,
+ ARM_FSET_MAKE (FL_CO_PROC | FL_CRC32 | FL_FOR_ARCH8A,
+ FL2_FOR_ARCH8_2A | FL2_FP16INST))
-+ARM_ARCH("armv8-m.base", cortexm0, 8M_BASE,
-+ ARM_FSET_MAKE_CPU1 ( FL_FOR_ARCH8M_BASE))
++ARM_ARCH("armv8-m.base", cortexm23, 8M_BASE,
++ ARM_FSET_MAKE (FL_FOR_ARCH8M_BASE, FL2_CMSE))
+ARM_ARCH("armv8-m.main", cortexm7, 8M_MAIN,
-+ ARM_FSET_MAKE_CPU1(FL_CO_PROC | FL_FOR_ARCH8M_MAIN))
-+ARM_ARCH("armv8-m.main+dsp", cortexm7, 8M_MAIN,
-+ ARM_FSET_MAKE_CPU1(FL_CO_PROC | FL_ARCH7EM | FL_FOR_ARCH8M_MAIN))
++ ARM_FSET_MAKE (FL_CO_PROC | FL_FOR_ARCH8M_MAIN, FL2_CMSE))
++ARM_ARCH("armv8-m.main+dsp", cortexm33, 8M_MAIN,
++ ARM_FSET_MAKE (FL_CO_PROC | FL_ARCH7EM | FL_FOR_ARCH8M_MAIN, FL2_CMSE))
ARM_ARCH("iwmmxt", iwmmxt, 5TE, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_STRONG | FL_FOR_ARCH5TE | FL_XSCALE | FL_IWMMXT))
ARM_ARCH("iwmmxt2", iwmmxt2, 5TE, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_STRONG | FL_FOR_ARCH5TE | FL_XSCALE | FL_IWMMXT | FL_IWMMXT2))
-
@@ -49223,15 +51196,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
- that instruction's pattern in neon.md. */
+/* The NEON builtin data can be found in arm_neon_builtins.def and
+ arm_vfp_builtins.def. The entries in arm_neon_builtins.def require
-+ TARGET_NEON to be true. The entries in arm_vfp_builtins.def require
-+ TARGET_VFP to be true. The feature tests are checked when the builtins are
-+ expanded.
++ TARGET_NEON to be true. The feature tests are checked when the
++ builtins are expanded.
+
-+ The mode entries in the following table correspond to
-+ the "key" type of the instruction variant, i.e. equivalent to that which
-+ would be specified after the assembler mnemonic, which usually refers to the
-+ last vector operand. The modes listed per instruction should be the same as
-+ those defined for that instruction's pattern in neon.md. */
++ The mode entries in the following table correspond to the "key"
++ type of the instruction variant, i.e. equivalent to that which
++ would be specified after the assembler mnemonic, which usually
++ refers to the last vector operand. The modes listed per
++ instruction should be the same as those defined for that
++ instruction's pattern in neon.md. */
+
+static neon_builtin_datum vfp_builtin_data[] =
+{
@@ -49240,7 +51213,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
static neon_builtin_datum neon_builtin_data[] =
{
-@@ -534,6 +546,10 @@ enum arm_builtins
+@@ -515,6 +527,8 @@ enum arm_builtins
+ ARM_BUILTIN_GET_FPSCR,
+ ARM_BUILTIN_SET_FPSCR,
+
++ ARM_BUILTIN_CMSE_NONSECURE_CALLER,
++
+ #undef CRYPTO1
+ #undef CRYPTO2
+ #undef CRYPTO3
+@@ -534,6 +548,10 @@ enum arm_builtins
#undef CRYPTO2
#undef CRYPTO3
@@ -49251,7 +51233,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
ARM_BUILTIN_NEON_BASE,
ARM_BUILTIN_NEON_LANE_CHECK = ARM_BUILTIN_NEON_BASE,
-@@ -542,8 +558,11 @@ enum arm_builtins
+@@ -542,8 +560,11 @@ enum arm_builtins
ARM_BUILTIN_MAX
};
@@ -49264,7 +51246,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
#undef CF
#undef VAR1
-@@ -895,6 +914,110 @@ arm_init_simd_builtin_scalar_types (void)
+@@ -895,6 +916,110 @@ arm_init_simd_builtin_scalar_types (void)
"__builtin_neon_uti");
}
@@ -49375,7 +51357,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Set up all the NEON builtins, even builtins for instructions that are not
in the current target ISA to allow the user to compile particular modules
with different target specific options that differ from the command line
-@@ -924,103 +1047,22 @@ arm_init_neon_builtins (void)
+@@ -924,103 +1049,22 @@ arm_init_neon_builtins (void)
for (i = 0; i < ARRAY_SIZE (neon_builtin_data); i++, fcode++)
{
@@ -49491,7 +51473,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
}
-@@ -1768,7 +1810,7 @@ arm_init_builtins (void)
+@@ -1768,14 +1812,14 @@ arm_init_builtins (void)
if (TARGET_HARD_FLOAT)
{
arm_init_neon_builtins ();
@@ -49500,7 +51482,33 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
arm_init_crypto_builtins ();
}
-@@ -2211,40 +2253,16 @@ constant_arg:
+ if (TARGET_CRC32)
+ arm_init_crc32_builtins ();
+
+- if (TARGET_VFP && TARGET_HARD_FLOAT)
++ if (TARGET_HARD_FLOAT)
+ {
+ tree ftype_set_fpscr
+ = build_function_type_list (void_type_node, unsigned_type_node, NULL);
+@@ -1789,6 +1833,17 @@ arm_init_builtins (void)
+ = add_builtin_function ("__builtin_arm_stfscr", ftype_set_fpscr,
+ ARM_BUILTIN_SET_FPSCR, BUILT_IN_MD, NULL, NULL_TREE);
+ }
++
++ if (use_cmse)
++ {
++ tree ftype_cmse_nonsecure_caller
++ = build_function_type_list (unsigned_type_node, NULL);
++ arm_builtin_decls[ARM_BUILTIN_CMSE_NONSECURE_CALLER]
++ = add_builtin_function ("__builtin_arm_cmse_nonsecure_caller",
++ ftype_cmse_nonsecure_caller,
++ ARM_BUILTIN_CMSE_NONSECURE_CALLER, BUILT_IN_MD,
++ NULL, NULL_TREE);
++ }
+ }
+
+ /* Return the ARM builtin for CODE. */
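
The hunk above registers __builtin_arm_cmse_nonsecure_caller when compiling
with CMSE enabled.  A hedged usage sketch -- user code normally reaches the
builtin through the ACLE wrapper in arm_cmse.h; secure_counter is hypothetical
and the semantics shown assume a -mcmse build:

  static int secure_counter;

  int __attribute__ ((cmse_nonsecure_entry))
  read_counter (void)
  {
    /* Non-zero when the entry function was called from the
       non-secure state.  */
    if (__builtin_arm_cmse_nonsecure_caller ())
      return -1;                /* refuse non-secure callers */
    return secure_counter;
  }
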
+@@ -2211,40 +2266,16 @@ constant_arg:
return target;
}
@@ -49549,7 +51557,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
enum insn_code icode = d->code;
builtin_arg args[SIMD_MAX_BUILTIN_ARGS + 1];
int num_args = insn_data[d->code].n_operands;
-@@ -2260,8 +2278,8 @@ arm_expand_neon_builtin (int fcode, tree exp, rtx target)
+@@ -2260,8 +2291,8 @@ arm_expand_neon_builtin (int fcode, tree exp, rtx target)
/* We have four arrays of data, each indexed in a different fashion.
qualifiers - element 0 always describes the function return type.
operands - element 0 is either the operand for return value (if
@@ -49560,7 +51568,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
expr_args - element 0 always holds the first argument.
args - element 0 is always used for the return type. */
int qualifiers_k = k;
-@@ -2283,7 +2301,7 @@ arm_expand_neon_builtin (int fcode, tree exp, rtx target)
+@@ -2283,7 +2314,7 @@ arm_expand_neon_builtin (int fcode, tree exp, rtx target)
bool op_const_int_p =
(CONST_INT_P (arg)
&& (*insn_data[icode].operand[operands_k].predicate)
@@ -49569,7 +51577,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
args[k] = op_const_int_p ? NEON_ARG_CONSTANT : NEON_ARG_COPY_TO_REG;
}
else if (d->qualifiers[qualifiers_k] & qualifier_pointer)
-@@ -2296,8 +2314,68 @@ arm_expand_neon_builtin (int fcode, tree exp, rtx target)
+@@ -2296,8 +2327,68 @@ arm_expand_neon_builtin (int fcode, tree exp, rtx target)
/* The interface to arm_expand_neon_args expects a 0 if
the function is void, and a 1 if it is not. */
return arm_expand_neon_args
@@ -49618,14 +51626,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ return arm_expand_neon_builtin_1 (fcode, exp, target, d);
+}
+
-+/* Expand a VFP builtin, if TARGET_VFP is true. These builtins are treated like
++/* Expand a VFP builtin. These builtins are treated like
+ neon builtins except that the data is looked up in table
+ VFP_BUILTIN_DATA. */
+
+static rtx
+arm_expand_vfp_builtin (int fcode, tree exp, rtx target)
+{
-+ if (fcode >= ARM_BUILTIN_VFP_BASE && ! TARGET_VFP)
++ if (fcode >= ARM_BUILTIN_VFP_BASE && ! TARGET_HARD_FLOAT)
+ {
+ fatal_error (input_location,
+ "You must enable VFP instructions"
@@ -49640,7 +51648,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
/* Expand an expression EXP that calls a built-in function,
-@@ -2337,13 +2415,18 @@ arm_expand_builtin (tree exp,
+@@ -2337,13 +2428,18 @@ arm_expand_builtin (tree exp,
if (fcode >= ARM_BUILTIN_NEON_BASE)
return arm_expand_neon_builtin (fcode, exp, target);
@@ -49660,9 +51668,64 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return const0_rtx;
}
+@@ -2368,6 +2464,12 @@ arm_expand_builtin (tree exp,
+ emit_insn (pat);
+ return target;
+
++ case ARM_BUILTIN_CMSE_NONSECURE_CALLER:
++ target = gen_reg_rtx (SImode);
++ op0 = arm_return_addr (0, NULL_RTX);
++ emit_insn (gen_addsi3 (target, op0, const1_rtx));
++ return target;
++
+ case ARM_BUILTIN_TEXTRMSB:
+ case ARM_BUILTIN_TEXTRMUB:
+ case ARM_BUILTIN_TEXTRMSH:
+@@ -2995,7 +3097,7 @@ arm_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
+ tree new_fenv_var, reload_fenv, restore_fnenv;
+ tree update_call, atomic_feraiseexcept, hold_fnclex;
+
+- if (!TARGET_VFP || !TARGET_HARD_FLOAT)
++ if (!TARGET_HARD_FLOAT)
+ return;
+
+ /* Generate the equivalent of :
--- a/src/gcc/config/arm/arm-c.c
+++ b/src/gcc/config/arm/arm-c.c
-@@ -135,10 +135,17 @@ arm_cpu_builtins (struct cpp_reader* pfile)
+@@ -76,6 +76,14 @@ arm_cpu_builtins (struct cpp_reader* pfile)
+
+ def_or_undef_macro (pfile, "__ARM_32BIT_STATE", TARGET_32BIT);
+
++ if (arm_arch8 && !arm_arch_notm)
++ {
++ if (arm_arch_cmse && use_cmse)
++ builtin_define_with_int_value ("__ARM_FEATURE_CMSE", 3);
++ else
++ builtin_define ("__ARM_FEATURE_CMSE");
++ }
++
+ if (TARGET_ARM_FEATURE_LDREX)
+ builtin_define_with_int_value ("__ARM_FEATURE_LDREX",
+ TARGET_ARM_FEATURE_LDREX);
+@@ -86,6 +94,9 @@ arm_cpu_builtins (struct cpp_reader* pfile)
+ ((TARGET_ARM_ARCH >= 5 && !TARGET_THUMB)
+ || TARGET_ARM_ARCH_ISA_THUMB >=2));
+
++ def_or_undef_macro (pfile, "__ARM_FEATURE_NUMERIC_MAXMIN",
++ TARGET_ARM_ARCH >= 8 && TARGET_NEON && TARGET_FPU_ARMV8);
++
+ def_or_undef_macro (pfile, "__ARM_FEATURE_SIMD32", TARGET_INT_SIMD);
+
+ builtin_define_with_int_value ("__ARM_SIZEOF_MINIMAL_ENUM",
+@@ -128,17 +139,24 @@ arm_cpu_builtins (struct cpp_reader* pfile)
+ if (TARGET_SOFT_FLOAT)
+ builtin_define ("__SOFTFP__");
+
+- def_or_undef_macro (pfile, "__VFP_FP__", TARGET_VFP);
++ builtin_define ("__VFP_FP__");
+
+ if (TARGET_ARM_FP)
+ builtin_define_with_int_value ("__ARM_FP", TARGET_ARM_FP);
else
cpp_undef (pfile, "__ARM_FP");
@@ -49686,7 +51749,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
def_or_undef_macro (pfile, "__ARM_NEON__", TARGET_NEON);
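
The __ARM_FEATURE_CMSE change above gives user code two distinguishable
states: the macro is plain-defined when the target merely has the v8-M TT
instructions, and defined to 3 when compiling for the secure state.  A
preprocessor sketch, assuming ACLE's reading of the value; HAVE_SECURE_STATE
is a hypothetical project macro:

  #if defined (__ARM_FEATURE_CMSE) && __ARM_FEATURE_CMSE == 3
  #  define HAVE_SECURE_STATE 1
  #elif defined (__ARM_FEATURE_CMSE)
  #  define HAVE_SECURE_STATE 0   /* TT instructions only, no -mcmse */
  #else
  #  define HAVE_SECURE_STATE 0
  #endif
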
--- a/src/gcc/config/arm/arm-cores.def
+++ b/src/gcc/config/arm/arm-cores.def
-@@ -171,10 +171,14 @@ ARM_CORE("cortex-a35", cortexa35, cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED
+@@ -166,15 +166,21 @@ ARM_CORE("cortex-a15.cortex-a7", cortexa15cortexa7, cortexa7, 7A, ARM_FSET_MAKE_
+ ARM_CORE("cortex-a17.cortex-a7", cortexa17cortexa7, cortexa7, 7A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_THUMB_DIV | FL_ARM_DIV | FL_FOR_ARCH7A), cortex_a12)
+
+ /* V8 Architecture Processors */
++ARM_CORE("cortex-m23", cortexm23, cortexm23, 8M_BASE, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_FOR_ARCH8M_BASE), v6m)
+ ARM_CORE("cortex-a32", cortexa32, cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a35)
++ARM_CORE("cortex-m33", cortexm33, cortexm33, 8M_MAIN, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_ARCH7EM | FL_FOR_ARCH8M_MAIN), v7m)
+ ARM_CORE("cortex-a35", cortexa35, cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a35)
ARM_CORE("cortex-a53", cortexa53, cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a53)
ARM_CORE("cortex-a57", cortexa57, cortexa57, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
ARM_CORE("cortex-a72", cortexa72, cortexa57, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
@@ -49702,6 +51772,274 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ARM_CORE("cortex-a73.cortex-a35", cortexa73cortexa35, cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a73)
+ARM_CORE("cortex-a73.cortex-a53", cortexa73cortexa53, cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a73)
+
+--- /dev/null
++++ b/src/gcc/config/arm/arm-flags.h
+@@ -0,0 +1,212 @@
++/* Flags used to identify the presence of processor capabilities.
++
++ Copyright (C) 2016 Free Software Foundation, Inc.
++ Contributed by ARM Ltd.
++
++ This file is part of GCC.
++
++ GCC is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published
++ by the Free Software Foundation; either version 3, or (at your
++ option) any later version.
++
++ GCC is distributed in the hope that it will be useful, but WITHOUT
++ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
++ License for more details.
++
++ You should have received a copy of the GNU General Public License
++ along with GCC; see the file COPYING3. If not see
++ <http://www.gnu.org/licenses/>. */
++
++#ifndef GCC_ARM_FLAGS_H
++#define GCC_ARM_FLAGS_H
++
++/* Flags used to identify the presence of processor capabilities. */
++
++/* Bit values used to identify processor capabilities. */
++#define FL_NONE (0U) /* No flags. */
++#define FL_ANY (0xffffffffU) /* All flags. */
++#define FL_CO_PROC (1U << 0) /* Has external co-processor bus. */
++#define FL_ARCH3M (1U << 1) /* Extended multiply. */
++#define FL_MODE26 (1U << 2) /* 26-bit mode support. */
++#define FL_MODE32 (1U << 3) /* 32-bit mode support. */
++#define FL_ARCH4 (1U << 4) /* Architecture rel 4. */
++#define FL_ARCH5 (1U << 5) /* Architecture rel 5. */
++#define FL_THUMB (1U << 6) /* Thumb aware. */
++#define FL_LDSCHED (1U << 7) /* Load scheduling necessary. */
++#define FL_STRONG (1U << 8) /* StrongARM. */
++#define FL_ARCH5E (1U << 9) /* DSP extensions to v5. */
++#define FL_XSCALE (1U << 10) /* XScale. */
++/* spare (1U << 11) */
++#define FL_ARCH6 (1U << 12) /* Architecture rel 6. Adds
++ media instructions. */
++#define FL_VFPV2 (1U << 13) /* Vector Floating Point V2. */
++#define FL_WBUF (1U << 14) /* Schedule for write buffer ops.
++ Note: ARM6 & 7 derivatives only. */
++#define FL_ARCH6K (1U << 15) /* Architecture rel 6 K extensions. */
++#define FL_THUMB2 (1U << 16) /* Thumb-2. */
++#define FL_NOTM (1U << 17) /* Instructions not present in the 'M'
++ profile. */
++#define FL_THUMB_DIV (1U << 18) /* Hardware divide (Thumb mode). */
++#define FL_VFPV3 (1U << 19) /* Vector Floating Point V3. */
++#define FL_NEON (1U << 20) /* Neon instructions. */
++#define FL_ARCH7EM (1U << 21) /* Instructions present in the ARMv7E-M
++ architecture. */
++#define FL_ARCH7 (1U << 22) /* Architecture 7. */
++#define FL_ARM_DIV (1U << 23) /* Hardware divide (ARM mode). */
++#define FL_ARCH8 (1U << 24) /* Architecture 8. */
++#define FL_CRC32 (1U << 25) /* ARMv8 CRC32 instructions. */
++#define FL_SMALLMUL (1U << 26) /* Small multiply supported. */
++#define FL_NO_VOLATILE_CE (1U << 27) /* No volatile memory in IT block. */
++
++#define FL_IWMMXT (1U << 29) /* XScale v2 or "Intel Wireless MMX
++ technology". */
++#define FL_IWMMXT2 (1U << 30) /* "Intel Wireless MMX2
++ technology". */
++#define FL_ARCH6KZ (1U << 31) /* ARMv6KZ architecture. */
++
++#define FL2_ARCH8_1 (1U << 0) /* Architecture 8.1. */
++#define FL2_ARCH8_2 (1U << 1) /* Architecture 8.2. */
++#define FL2_FP16INST (1U << 2) /* FP16 Instructions for ARMv8.2 and
++ later. */
++#define FL2_CMSE (1U << 3) /* ARMv8-M Security Extensions. */
++
++/* Flags that only effect tuning, not available instructions. */
++#define FL_TUNE (FL_WBUF | FL_VFPV2 | FL_STRONG | FL_LDSCHED \
++ | FL_CO_PROC)
++
++#define FL_FOR_ARCH2 FL_NOTM
++#define FL_FOR_ARCH3 (FL_FOR_ARCH2 | FL_MODE32)
++#define FL_FOR_ARCH3M (FL_FOR_ARCH3 | FL_ARCH3M)
++#define FL_FOR_ARCH4 (FL_FOR_ARCH3M | FL_ARCH4)
++#define FL_FOR_ARCH4T (FL_FOR_ARCH4 | FL_THUMB)
++#define FL_FOR_ARCH5 (FL_FOR_ARCH4 | FL_ARCH5)
++#define FL_FOR_ARCH5T (FL_FOR_ARCH5 | FL_THUMB)
++#define FL_FOR_ARCH5E (FL_FOR_ARCH5 | FL_ARCH5E)
++#define FL_FOR_ARCH5TE (FL_FOR_ARCH5E | FL_THUMB)
++#define FL_FOR_ARCH5TEJ FL_FOR_ARCH5TE
++#define FL_FOR_ARCH6 (FL_FOR_ARCH5TE | FL_ARCH6)
++#define FL_FOR_ARCH6J FL_FOR_ARCH6
++#define FL_FOR_ARCH6K (FL_FOR_ARCH6 | FL_ARCH6K)
++#define FL_FOR_ARCH6Z FL_FOR_ARCH6
++#define FL_FOR_ARCH6ZK FL_FOR_ARCH6K
++#define FL_FOR_ARCH6KZ (FL_FOR_ARCH6K | FL_ARCH6KZ)
++#define FL_FOR_ARCH6T2 (FL_FOR_ARCH6 | FL_THUMB2)
++#define FL_FOR_ARCH6M (FL_FOR_ARCH6 & ~FL_NOTM)
++#define FL_FOR_ARCH7 ((FL_FOR_ARCH6T2 & ~FL_NOTM) | FL_ARCH7)
++#define FL_FOR_ARCH7A (FL_FOR_ARCH7 | FL_NOTM | FL_ARCH6K)
++#define FL_FOR_ARCH7VE (FL_FOR_ARCH7A | FL_THUMB_DIV | FL_ARM_DIV)
++#define FL_FOR_ARCH7R (FL_FOR_ARCH7A | FL_THUMB_DIV)
++#define FL_FOR_ARCH7M (FL_FOR_ARCH7 | FL_THUMB_DIV)
++#define FL_FOR_ARCH7EM (FL_FOR_ARCH7M | FL_ARCH7EM)
++#define FL_FOR_ARCH8A (FL_FOR_ARCH7VE | FL_ARCH8)
++#define FL2_FOR_ARCH8_1A FL2_ARCH8_1
++#define FL2_FOR_ARCH8_2A (FL2_FOR_ARCH8_1A | FL2_ARCH8_2)
++#define FL_FOR_ARCH8M_BASE (FL_FOR_ARCH6M | FL_ARCH8 | FL_THUMB_DIV)
++#define FL_FOR_ARCH8M_MAIN (FL_FOR_ARCH7M | FL_ARCH8)
++
++/* There are too many feature bits to fit in a single word so the set of cpu and
++ fpu capabilities is a structure. A feature set is created and manipulated
++ with the ARM_FSET macros. */
++
++typedef struct
++{
++ unsigned cpu[2];
++} arm_feature_set;
++
++
++/* Initialize a feature set. */
++
++#define ARM_FSET_MAKE(CPU1,CPU2) { { (CPU1), (CPU2) } }
++
++#define ARM_FSET_MAKE_CPU1(CPU1) ARM_FSET_MAKE ((CPU1), (FL_NONE))
++#define ARM_FSET_MAKE_CPU2(CPU2) ARM_FSET_MAKE ((FL_NONE), (CPU2))
++
++/* Accessors. */
++
++#define ARM_FSET_CPU1(S) ((S).cpu[0])
++#define ARM_FSET_CPU2(S) ((S).cpu[1])
++
++/* Useful combinations. */
++
++#define ARM_FSET_EMPTY ARM_FSET_MAKE (FL_NONE, FL_NONE)
++#define ARM_FSET_ANY ARM_FSET_MAKE (FL_ANY, FL_ANY)
++
++/* Tests for a specific CPU feature. */
++
++#define ARM_FSET_HAS_CPU1(A, F) \
++ (((A).cpu[0] & ((unsigned long)(F))) == ((unsigned long)(F)))
++#define ARM_FSET_HAS_CPU2(A, F) \
++ (((A).cpu[1] & ((unsigned long)(F))) == ((unsigned long)(F)))
++#define ARM_FSET_HAS_CPU(A, F1, F2) \
++ (ARM_FSET_HAS_CPU1 ((A), (F1)) && ARM_FSET_HAS_CPU2 ((A), (F2)))
++
++/* Add a feature to a feature set. */
++
++#define ARM_FSET_ADD_CPU1(DST, F) \
++ do { \
++ (DST).cpu[0] |= (F); \
++ } while (0)
++
++#define ARM_FSET_ADD_CPU2(DST, F) \
++ do { \
++ (DST).cpu[1] |= (F); \
++ } while (0)
++
++/* Remove a feature from a feature set. */
++
++#define ARM_FSET_DEL_CPU1(DST, F) \
++ do { \
++ (DST).cpu[0] &= ~(F); \
++ } while (0)
++
++#define ARM_FSET_DEL_CPU2(DST, F) \
++ do { \
++ (DST).cpu[1] &= ~(F); \
++ } while (0)
++
++/* Union of feature sets. */
++
++#define ARM_FSET_UNION(DST,F1,F2) \
++ do { \
++ (DST).cpu[0] = (F1).cpu[0] | (F2).cpu[0]; \
++ (DST).cpu[1] = (F1).cpu[1] | (F2).cpu[1]; \
++ } while (0)
++
++/* Intersection of feature sets. */
++
++#define ARM_FSET_INTER(DST,F1,F2) \
++ do { \
++ (DST).cpu[0] = (F1).cpu[0] & (F2).cpu[0]; \
++ (DST).cpu[1] = (F1).cpu[1] & (F2).cpu[1]; \
++ } while (0)
++
++/* Exclusive disjunction. */
++
++#define ARM_FSET_XOR(DST,F1,F2) \
++ do { \
++ (DST).cpu[0] = (F1).cpu[0] ^ (F2).cpu[0]; \
++ (DST).cpu[1] = (F1).cpu[1] ^ (F2).cpu[1]; \
++ } while (0)
++
++/* Difference of feature sets: F1 excluding the elements of F2. */
++
++#define ARM_FSET_EXCLUDE(DST,F1,F2) \
++ do { \
++ (DST).cpu[0] = (F1).cpu[0] & ~(F2).cpu[0]; \
++ (DST).cpu[1] = (F1).cpu[1] & ~(F2).cpu[1]; \
++ } while (0)
++
++/* Test for an empty feature set. */
++
++#define ARM_FSET_IS_EMPTY(A) \
++ (!((A).cpu[0]) && !((A).cpu[1]))
++
++/* Tests whether the cpu features of A are a subset of B. */
++
++#define ARM_FSET_CPU_SUBSET(A,B) \
++ ((((A).cpu[0] & (B).cpu[0]) == (A).cpu[0]) \
++ && (((A).cpu[1] & (B).cpu[1]) == (A).cpu[1]))
++
++#endif /* GCC_ARM_FLAGS_H */
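
A short sketch of how the two-word feature sets introduced by this new header
are built and queried -- illustrative only; the core_has_cmse_p helper is
hypothetical, but it uses only macros defined above:

  #include "arm-flags.h"

  /* Build the feature set of an ARMv8-M mainline core and test the
     second-word CMSE bit.  */
  static int
  core_has_cmse_p (void)
  {
    arm_feature_set fs
      = ARM_FSET_MAKE (FL_CO_PROC | FL_FOR_ARCH8M_MAIN, FL2_CMSE);
    return ARM_FSET_HAS_CPU2 (fs, FL2_CMSE);
  }
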
+--- a/src/gcc/config/arm/arm-fpus.def
++++ b/src/gcc/config/arm/arm-fpus.def
+@@ -19,30 +19,29 @@
+
+ /* Before using #include to read this file, define a macro:
+
+- ARM_FPU(NAME, MODEL, REV, VFP_REGS, FEATURES)
++ ARM_FPU(NAME, REV, VFP_REGS, FEATURES)
+
+ The arguments are the fields of struct arm_fpu_desc.
+
+ genopt.sh assumes no whitespace up to the first "," in each entry. */
+
+-ARM_FPU("vfp", ARM_FP_MODEL_VFP, 2, VFP_REG_D16, FPU_FL_NONE)
+-ARM_FPU("vfpv3", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, FPU_FL_NONE)
+-ARM_FPU("vfpv3-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, FPU_FL_FP16)
+-ARM_FPU("vfpv3-d16", ARM_FP_MODEL_VFP, 3, VFP_REG_D16, FPU_FL_NONE)
+-ARM_FPU("vfpv3-d16-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_D16, FPU_FL_FP16)
+-ARM_FPU("vfpv3xd", ARM_FP_MODEL_VFP, 3, VFP_REG_SINGLE, FPU_FL_NONE)
+-ARM_FPU("vfpv3xd-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_SINGLE, FPU_FL_FP16)
+-ARM_FPU("neon", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, FPU_FL_NEON)
+-ARM_FPU("neon-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16)
+-ARM_FPU("vfpv4", ARM_FP_MODEL_VFP, 4, VFP_REG_D32, FPU_FL_FP16)
+-ARM_FPU("vfpv4-d16", ARM_FP_MODEL_VFP, 4, VFP_REG_D16, FPU_FL_FP16)
+-ARM_FPU("fpv4-sp-d16", ARM_FP_MODEL_VFP, 4, VFP_REG_SINGLE, FPU_FL_FP16)
+-ARM_FPU("fpv5-sp-d16", ARM_FP_MODEL_VFP, 5, VFP_REG_SINGLE, FPU_FL_FP16)
+-ARM_FPU("fpv5-d16", ARM_FP_MODEL_VFP, 5, VFP_REG_D16, FPU_FL_FP16)
+-ARM_FPU("neon-vfpv4", ARM_FP_MODEL_VFP, 4, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16)
+-ARM_FPU("fp-armv8", ARM_FP_MODEL_VFP, 8, VFP_REG_D32, FPU_FL_FP16)
+-ARM_FPU("neon-fp-armv8",ARM_FP_MODEL_VFP, 8, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16)
+-ARM_FPU("crypto-neon-fp-armv8",
+- ARM_FP_MODEL_VFP, 8, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16 | FPU_FL_CRYPTO)
++ARM_FPU("vfp", 2, VFP_REG_D16, FPU_FL_NONE)
++ARM_FPU("vfpv3", 3, VFP_REG_D32, FPU_FL_NONE)
++ARM_FPU("vfpv3-fp16", 3, VFP_REG_D32, FPU_FL_FP16)
++ARM_FPU("vfpv3-d16", 3, VFP_REG_D16, FPU_FL_NONE)
++ARM_FPU("vfpv3-d16-fp16", 3, VFP_REG_D16, FPU_FL_FP16)
++ARM_FPU("vfpv3xd", 3, VFP_REG_SINGLE, FPU_FL_NONE)
++ARM_FPU("vfpv3xd-fp16", 3, VFP_REG_SINGLE, FPU_FL_FP16)
++ARM_FPU("neon", 3, VFP_REG_D32, FPU_FL_NEON)
++ARM_FPU("neon-fp16", 3, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16)
++ARM_FPU("vfpv4", 4, VFP_REG_D32, FPU_FL_FP16)
++ARM_FPU("vfpv4-d16", 4, VFP_REG_D16, FPU_FL_FP16)
++ARM_FPU("fpv4-sp-d16", 4, VFP_REG_SINGLE, FPU_FL_FP16)
++ARM_FPU("fpv5-sp-d16", 5, VFP_REG_SINGLE, FPU_FL_FP16)
++ARM_FPU("fpv5-d16", 5, VFP_REG_D16, FPU_FL_FP16)
++ARM_FPU("neon-vfpv4", 4, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16)
++ARM_FPU("fp-armv8", 8, VFP_REG_D32, FPU_FL_FP16)
++ARM_FPU("neon-fp-armv8", 8, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16)
++ARM_FPU("crypto-neon-fp-armv8", 8, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16 | FPU_FL_CRYPTO)
+ /* Compatibility aliases. */
+-ARM_FPU("vfp3", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, FPU_FL_NONE)
++ARM_FPU("vfp3", 3, VFP_REG_D32, FPU_FL_NONE)
--- a/src/gcc/config/arm/arm-modes.def
+++ b/src/gcc/config/arm/arm-modes.def
@@ -59,6 +59,7 @@ CC_MODE (CC_DGEU);
@@ -49712,9 +52050,62 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Vector modes. */
VECTOR_MODES (INT, 4); /* V4QI V2HI */
+--- a/src/gcc/config/arm/arm-opts.h
++++ b/src/gcc/config/arm/arm-opts.h
+@@ -25,6 +25,8 @@
+ #ifndef ARM_OPTS_H
+ #define ARM_OPTS_H
+
++#include "arm-flags.h"
++
+ /* The various ARM cores. */
+ enum processor_type
+ {
+@@ -77,4 +79,24 @@ enum arm_tls_type {
+ TLS_GNU,
+ TLS_GNU2
+ };
++
++struct arm_arch_core_flag
++{
++ const char *const name;
++ const arm_feature_set flags;
++};
++
++static const struct arm_arch_core_flag arm_arch_core_flags[] =
++{
++#undef ARM_CORE
++#define ARM_CORE(NAME, X, IDENT, ARCH, FLAGS, COSTS) \
++ {NAME, FLAGS},
++#include "arm-cores.def"
++#undef ARM_CORE
++#undef ARM_ARCH
++#define ARM_ARCH(NAME, CORE, ARCH, FLAGS) \
++ {NAME, FLAGS},
++#include "arm-arches.def"
++#undef ARM_ARCH
++};
+ #endif
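
The arm_arch_core_flags table added above is populated by re-including
arm-cores.def and arm-arches.def with ARM_CORE/ARM_ARCH redefined to emit one
{name, flags} row each.  A hedged sketch of a lookup over it; the function is
hypothetical:

  #include <string.h>

  static const arm_feature_set *
  lookup_arch_core_flags (const char *name)
  {
    size_t n = sizeof arm_arch_core_flags / sizeof arm_arch_core_flags[0];
    for (size_t i = 0; i < n; i++)
      if (strcmp (arm_arch_core_flags[i].name, name) == 0)
        return &arm_arch_core_flags[i].flags;
    return NULL;   /* unknown -mcpu=/-march= name */
  }
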
--- a/src/gcc/config/arm/arm-protos.h
+++ b/src/gcc/config/arm/arm-protos.h
-@@ -50,8 +50,12 @@ extern tree arm_builtin_decl (unsigned code, bool initialize_p
+@@ -22,6 +22,8 @@
+ #ifndef GCC_ARM_PROTOS_H
+ #define GCC_ARM_PROTOS_H
+
++#include "arm-flags.h"
++
+ extern enum unwind_info_type arm_except_unwind_info (struct gcc_options *);
+ extern int use_return_insn (int, rtx);
+ extern bool use_simple_return_p (void);
+@@ -31,6 +33,7 @@ extern int arm_volatile_func (void);
+ extern void arm_expand_prologue (void);
+ extern void arm_expand_epilogue (bool);
+ extern void arm_declare_function_name (FILE *, const char *, tree);
++extern void arm_asm_declare_function_name (FILE *, const char *, tree);
+ extern void thumb2_expand_return (bool);
+ extern const char *arm_strip_name_encoding (const char *);
+ extern void arm_asm_output_labelref (FILE *, const char *);
+@@ -50,8 +53,12 @@ extern tree arm_builtin_decl (unsigned code, bool initialize_p
ATTRIBUTE_UNUSED);
extern void arm_init_builtins (void);
extern void arm_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update);
@@ -49728,7 +52119,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
extern bool arm_vector_mode_supported_p (machine_mode);
extern bool arm_small_register_classes_for_mode_p (machine_mode);
extern int arm_hard_regno_mode_ok (unsigned int, machine_mode);
-@@ -161,6 +165,7 @@ extern const char *arm_output_iwmmxt_shift_immediate (const char *, rtx *, bool)
+@@ -130,6 +137,7 @@ extern int arm_const_double_inline_cost (rtx);
+ extern bool arm_const_double_by_parts (rtx);
+ extern bool arm_const_double_by_immediates (rtx);
+ extern void arm_emit_call_insn (rtx, rtx, bool);
++bool detect_cmse_nonsecure_call (tree);
+ extern const char *output_call (rtx *);
+ void arm_emit_movpair (rtx, rtx);
+ extern const char *output_mov_long_double_arm_from_arm (rtx *);
+@@ -161,6 +169,7 @@ extern const char *arm_output_iwmmxt_shift_immediate (const char *, rtx *, bool)
extern const char *arm_output_iwmmxt_tinsr (rtx *);
extern unsigned int arm_sync_loop_insns (rtx , rtx *);
extern int arm_attr_length_push_multi(rtx, rtx);
@@ -49736,7 +52135,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
extern void arm_expand_compare_and_swap (rtx op[]);
extern void arm_split_compare_and_swap (rtx op[]);
extern void arm_split_atomic_op (enum rtx_code, rtx, rtx, rtx, rtx, rtx, rtx);
-@@ -192,7 +197,6 @@ extern const char *thumb_call_via_reg (rtx);
+@@ -192,7 +201,6 @@ extern const char *thumb_call_via_reg (rtx);
extern void thumb_expand_movmemqi (rtx *);
extern rtx arm_return_addr (int, rtx);
extern void thumb_reload_out_hi (rtx *);
@@ -49744,7 +52143,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
extern void thumb_set_return_address (rtx, rtx);
extern const char *thumb1_output_casesi (rtx *);
extern const char *thumb2_output_casesi (rtx *);
-@@ -319,6 +323,7 @@ extern int vfp3_const_double_for_bits (rtx);
+@@ -319,6 +327,7 @@ extern int vfp3_const_double_for_bits (rtx);
extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
rtx);
@@ -49752,18 +52151,59 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
extern bool arm_valid_symbolic_address_p (rtx);
extern bool arm_validize_comparison (rtx *, rtx *, rtx *);
#endif /* RTX_CODE */
-@@ -388,36 +393,43 @@ extern bool arm_is_constant_pool_ref (rtx);
- #define FL_ARCH6KZ (1 << 31) /* ARMv6KZ architecture. */
-
- #define FL2_ARCH8_1 (1 << 0) /* Architecture 8.1. */
-+#define FL2_ARCH8_2 (1 << 1) /* Architecture 8.2. */
-+#define FL2_FP16INST (1 << 2) /* FP16 Instructions for ARMv8.2 and
-+ later. */
-
- /* Flags that only effect tuning, not available instructions. */
- #define FL_TUNE (FL_WBUF | FL_VFPV2 | FL_STRONG | FL_LDSCHED \
- | FL_CO_PROC)
-
+@@ -344,184 +353,6 @@ extern void arm_cpu_cpp_builtins (struct cpp_reader *);
+
+ extern bool arm_is_constant_pool_ref (rtx);
+
+-/* Flags used to identify the presence of processor capabilities. */
+-
+-/* Bit values used to identify processor capabilities. */
+-#define FL_NONE (0) /* No flags. */
+-#define FL_ANY (0xffffffff) /* All flags. */
+-#define FL_CO_PROC (1 << 0) /* Has external co-processor bus */
+-#define FL_ARCH3M (1 << 1) /* Extended multiply */
+-#define FL_MODE26 (1 << 2) /* 26-bit mode support */
+-#define FL_MODE32 (1 << 3) /* 32-bit mode support */
+-#define FL_ARCH4 (1 << 4) /* Architecture rel 4 */
+-#define FL_ARCH5 (1 << 5) /* Architecture rel 5 */
+-#define FL_THUMB (1 << 6) /* Thumb aware */
+-#define FL_LDSCHED (1 << 7) /* Load scheduling necessary */
+-#define FL_STRONG (1 << 8) /* StrongARM */
+-#define FL_ARCH5E (1 << 9) /* DSP extensions to v5 */
+-#define FL_XSCALE (1 << 10) /* XScale */
+-/* spare (1 << 11) */
+-#define FL_ARCH6 (1 << 12) /* Architecture rel 6. Adds
+- media instructions. */
+-#define FL_VFPV2 (1 << 13) /* Vector Floating Point V2. */
+-#define FL_WBUF (1 << 14) /* Schedule for write buffer ops.
+- Note: ARM6 & 7 derivatives only. */
+-#define FL_ARCH6K (1 << 15) /* Architecture rel 6 K extensions. */
+-#define FL_THUMB2 (1 << 16) /* Thumb-2. */
+-#define FL_NOTM (1 << 17) /* Instructions not present in the 'M'
+- profile. */
+-#define FL_THUMB_DIV (1 << 18) /* Hardware divide (Thumb mode). */
+-#define FL_VFPV3 (1 << 19) /* Vector Floating Point V3. */
+-#define FL_NEON (1 << 20) /* Neon instructions. */
+-#define FL_ARCH7EM (1 << 21) /* Instructions present in the ARMv7E-M
+- architecture. */
+-#define FL_ARCH7 (1 << 22) /* Architecture 7. */
+-#define FL_ARM_DIV (1 << 23) /* Hardware divide (ARM mode). */
+-#define FL_ARCH8 (1 << 24) /* Architecture 8. */
+-#define FL_CRC32 (1 << 25) /* ARMv8 CRC32 instructions. */
+-
+-#define FL_SMALLMUL (1 << 26) /* Small multiply supported. */
+-#define FL_NO_VOLATILE_CE (1 << 27) /* No volatile memory in IT block. */
+-
+-#define FL_IWMMXT (1 << 29) /* XScale v2 or "Intel Wireless MMX technology". */
+-#define FL_IWMMXT2 (1 << 30) /* "Intel Wireless MMX2 technology". */
+-#define FL_ARCH6KZ (1 << 31) /* ARMv6KZ architecture. */
+-
+-#define FL2_ARCH8_1 (1 << 0) /* Architecture 8.1. */
+-
+-/* Flags that only effect tuning, not available instructions. */
+-#define FL_TUNE (FL_WBUF | FL_VFPV2 | FL_STRONG | FL_LDSCHED \
+- | FL_CO_PROC)
+-
-#define FL_FOR_ARCH2 FL_NOTM
-#define FL_FOR_ARCH3 (FL_FOR_ARCH2 | FL_MODE32)
-#define FL_FOR_ARCH3M (FL_FOR_ARCH3 | FL_ARCH3M)
@@ -49788,39 +52228,115 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-#define FL_FOR_ARCH7M (FL_FOR_ARCH7 | FL_THUMB_DIV)
-#define FL_FOR_ARCH7EM (FL_FOR_ARCH7M | FL_ARCH7EM)
-#define FL_FOR_ARCH8A (FL_FOR_ARCH7VE | FL_ARCH8)
-+#define FL_FOR_ARCH2 FL_NOTM
-+#define FL_FOR_ARCH3 (FL_FOR_ARCH2 | FL_MODE32)
-+#define FL_FOR_ARCH3M (FL_FOR_ARCH3 | FL_ARCH3M)
-+#define FL_FOR_ARCH4 (FL_FOR_ARCH3M | FL_ARCH4)
-+#define FL_FOR_ARCH4T (FL_FOR_ARCH4 | FL_THUMB)
-+#define FL_FOR_ARCH5 (FL_FOR_ARCH4 | FL_ARCH5)
-+#define FL_FOR_ARCH5T (FL_FOR_ARCH5 | FL_THUMB)
-+#define FL_FOR_ARCH5E (FL_FOR_ARCH5 | FL_ARCH5E)
-+#define FL_FOR_ARCH5TE (FL_FOR_ARCH5E | FL_THUMB)
-+#define FL_FOR_ARCH5TEJ FL_FOR_ARCH5TE
-+#define FL_FOR_ARCH6 (FL_FOR_ARCH5TE | FL_ARCH6)
-+#define FL_FOR_ARCH6J FL_FOR_ARCH6
-+#define FL_FOR_ARCH6K (FL_FOR_ARCH6 | FL_ARCH6K)
-+#define FL_FOR_ARCH6Z FL_FOR_ARCH6
-+#define FL_FOR_ARCH6ZK FL_FOR_ARCH6K
-+#define FL_FOR_ARCH6KZ (FL_FOR_ARCH6K | FL_ARCH6KZ)
-+#define FL_FOR_ARCH6T2 (FL_FOR_ARCH6 | FL_THUMB2)
-+#define FL_FOR_ARCH6M (FL_FOR_ARCH6 & ~FL_NOTM)
-+#define FL_FOR_ARCH7 ((FL_FOR_ARCH6T2 & ~FL_NOTM) | FL_ARCH7)
-+#define FL_FOR_ARCH7A (FL_FOR_ARCH7 | FL_NOTM | FL_ARCH6K)
-+#define FL_FOR_ARCH7VE (FL_FOR_ARCH7A | FL_THUMB_DIV | FL_ARM_DIV)
-+#define FL_FOR_ARCH7R (FL_FOR_ARCH7A | FL_THUMB_DIV)
-+#define FL_FOR_ARCH7M (FL_FOR_ARCH7 | FL_THUMB_DIV)
-+#define FL_FOR_ARCH7EM (FL_FOR_ARCH7M | FL_ARCH7EM)
-+#define FL_FOR_ARCH8A (FL_FOR_ARCH7VE | FL_ARCH8)
- #define FL2_FOR_ARCH8_1A FL2_ARCH8_1
-+#define FL2_FOR_ARCH8_2A (FL2_FOR_ARCH8_1A | FL2_ARCH8_2)
-+#define FL_FOR_ARCH8M_BASE (FL_FOR_ARCH6M | FL_ARCH8 | FL_THUMB_DIV)
-+#define FL_FOR_ARCH8M_MAIN (FL_FOR_ARCH7M | FL_ARCH8)
-
- /* There are too many feature bits to fit in a single word so the set of cpu and
- fpu capabilities is a structure. A feature set is created and manipulated
-@@ -601,6 +613,9 @@ extern int arm_tune_cortex_a9;
+-#define FL2_FOR_ARCH8_1A FL2_ARCH8_1
+-
+-/* There are too many feature bits to fit in a single word so the set of cpu and
+- fpu capabilities is a structure. A feature set is created and manipulated
+- with the ARM_FSET macros. */
+-
+-typedef struct
+-{
+- unsigned long cpu[2];
+-} arm_feature_set;
+-
+-
+-/* Initialize a feature set. */
+-
+-#define ARM_FSET_MAKE(CPU1,CPU2) { { (CPU1), (CPU2) } }
+-
+-#define ARM_FSET_MAKE_CPU1(CPU1) ARM_FSET_MAKE ((CPU1), (FL_NONE))
+-#define ARM_FSET_MAKE_CPU2(CPU2) ARM_FSET_MAKE ((FL_NONE), (CPU2))
+-
+-/* Accessors. */
+-
+-#define ARM_FSET_CPU1(S) ((S).cpu[0])
+-#define ARM_FSET_CPU2(S) ((S).cpu[1])
+-
+-/* Useful combinations. */
+-
+-#define ARM_FSET_EMPTY ARM_FSET_MAKE (FL_NONE, FL_NONE)
+-#define ARM_FSET_ANY ARM_FSET_MAKE (FL_ANY, FL_ANY)
+-
+-/* Tests for a specific CPU feature. */
+-
+-#define ARM_FSET_HAS_CPU1(A, F) \
+- (((A).cpu[0] & ((unsigned long)(F))) == ((unsigned long)(F)))
+-#define ARM_FSET_HAS_CPU2(A, F) \
+- (((A).cpu[1] & ((unsigned long)(F))) == ((unsigned long)(F)))
+-#define ARM_FSET_HAS_CPU(A, F1, F2) \
+- (ARM_FSET_HAS_CPU1 ((A), (F1)) && ARM_FSET_HAS_CPU2 ((A), (F2)))
+-
+-/* Add a feature to a feature set. */
+-
+-#define ARM_FSET_ADD_CPU1(DST, F) \
+- do { \
+- (DST).cpu[0] |= (F); \
+- } while (0)
+-
+-#define ARM_FSET_ADD_CPU2(DST, F) \
+- do { \
+- (DST).cpu[1] |= (F); \
+- } while (0)
+-
+-/* Remove a feature from a feature set. */
+-
+-#define ARM_FSET_DEL_CPU1(DST, F) \
+- do { \
+- (DST).cpu[0] &= ~(F); \
+- } while (0)
+-
+-#define ARM_FSET_DEL_CPU2(DST, F) \
+- do { \
+- (DST).cpu[1] &= ~(F); \
+- } while (0)
+-
+-/* Union of feature sets. */
+-
+-#define ARM_FSET_UNION(DST,F1,F2) \
+- do { \
+- (DST).cpu[0] = (F1).cpu[0] | (F2).cpu[0]; \
+- (DST).cpu[1] = (F1).cpu[1] | (F2).cpu[1]; \
+- } while (0)
+-
+-/* Intersection of feature sets. */
+-
+-#define ARM_FSET_INTER(DST,F1,F2) \
+- do { \
+- (DST).cpu[0] = (F1).cpu[0] & (F2).cpu[0]; \
+- (DST).cpu[1] = (F1).cpu[1] & (F2).cpu[1]; \
+- } while (0)
+-
+-/* Exclusive disjunction. */
+-
+-#define ARM_FSET_XOR(DST,F1,F2) \
+- do { \
+- (DST).cpu[0] = (F1).cpu[0] ^ (F2).cpu[0]; \
+- (DST).cpu[1] = (F1).cpu[1] ^ (F2).cpu[1]; \
+- } while (0)
+-
+-/* Difference of feature sets: F1 excluding the elements of F2. */
+-
+-#define ARM_FSET_EXCLUDE(DST,F1,F2) \
+- do { \
+- (DST).cpu[0] = (F1).cpu[0] & ~(F2).cpu[0]; \
+- (DST).cpu[1] = (F1).cpu[1] & ~(F2).cpu[1]; \
+- } while (0)
+-
+-/* Test for an empty feature set. */
+-
+-#define ARM_FSET_IS_EMPTY(A) \
+- (!((A).cpu[0]) && !((A).cpu[1]))
+-
+-/* Tests whether the cpu features of A are a subset of B. */
+-
+-#define ARM_FSET_CPU_SUBSET(A,B) \
+- ((((A).cpu[0] & (B).cpu[0]) == (A).cpu[0]) \
+- && (((A).cpu[1] & (B).cpu[1]) == (A).cpu[1]))
+-
+ /* The bits in this mask specify which
+ instructions we are allowed to generate. */
+ extern arm_feature_set insn_flags;
+@@ -601,6 +432,9 @@ extern int arm_tune_cortex_a9;
interworking clean. */
extern int arm_cpp_interwork;
@@ -49832,7 +52348,23 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
--- a/src/gcc/config/arm/arm-tables.opt
+++ b/src/gcc/config/arm/arm-tables.opt
-@@ -322,6 +322,9 @@ EnumValue
+@@ -307,9 +307,15 @@ EnumValue
+ Enum(processor_type) String(cortex-a17.cortex-a7) Value(cortexa17cortexa7)
+
+ EnumValue
++Enum(processor_type) String(cortex-m23) Value(cortexm23)
++
++EnumValue
+ Enum(processor_type) String(cortex-a32) Value(cortexa32)
+
+ EnumValue
++Enum(processor_type) String(cortex-m33) Value(cortexm33)
++
++EnumValue
+ Enum(processor_type) String(cortex-a35) Value(cortexa35)
+
+ EnumValue
+@@ -322,6 +328,9 @@ EnumValue
Enum(processor_type) String(cortex-a72) Value(cortexa72)
EnumValue
@@ -49842,7 +52374,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
Enum(processor_type) String(exynos-m1) Value(exynosm1)
EnumValue
-@@ -336,6 +339,12 @@ Enum(processor_type) String(cortex-a57.cortex-a53) Value(cortexa57cortexa53)
+@@ -336,6 +345,12 @@ Enum(processor_type) String(cortex-a57.cortex-a53) Value(cortexa57cortexa53)
EnumValue
Enum(processor_type) String(cortex-a72.cortex-a53) Value(cortexa72cortexa53)
@@ -49855,7 +52387,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
Enum
Name(arm_arch) Type(int)
Known ARM architectures (for use with the -march= option):
-@@ -428,10 +437,25 @@ EnumValue
+@@ -428,10 +443,25 @@ EnumValue
Enum(arm_arch) String(armv8.1-a+crc) Value(28)
EnumValue
@@ -49885,19 +52417,39 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
Name(arm_fpu) Type(int)
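
With the armv8-m.base/armv8-m.main entries added above, the new architectures
become selectable from the driver.  A hypothetical invocation (cross-compiler
name assumed), tying together the -march values added here and the -mcmse
option behind the use_cmse checks elsewhere in this patch:

  arm-none-eabi-gcc -march=armv8-m.main+dsp -mcmse -c secure_app.c
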
--- a/src/gcc/config/arm/arm-tune.md
+++ b/src/gcc/config/arm/arm-tune.md
-@@ -34,6 +34,7 @@
+@@ -32,8 +32,10 @@
+ cortexr4f,cortexr5,cortexr7,
+ cortexr8,cortexm7,cortexm4,
cortexm3,marvell_pj4,cortexa15cortexa7,
- cortexa17cortexa7,cortexa32,cortexa35,
- cortexa53,cortexa57,cortexa72,
-- exynosm1,qdf24xx,xgene1,
+- cortexa17cortexa7,cortexa32,cortexa35,
+- cortexa53,cortexa57,cortexa72,
++ cortexa17cortexa7,cortexm23,cortexa32,
++ cortexm33,cortexa35,cortexa53,
++ cortexa57,cortexa72,cortexa73,
+ exynosm1,qdf24xx,xgene1,
- cortexa57cortexa53,cortexa72cortexa53"
-+ cortexa73,exynosm1,qdf24xx,
-+ xgene1,cortexa57cortexa53,cortexa72cortexa53,
-+ cortexa73cortexa35,cortexa73cortexa53"
++ cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,
++ cortexa73cortexa53"
(const (symbol_ref "((enum attr_tune) arm_tune)")))
--- a/src/gcc/config/arm/arm.c
+++ b/src/gcc/config/arm/arm.c
-@@ -104,7 +104,6 @@ static void arm_print_operand_address (FILE *, machine_mode, rtx);
+@@ -27,6 +27,7 @@
+ #include "target.h"
+ #include "rtl.h"
+ #include "tree.h"
++#include "memmodel.h"
+ #include "cfghooks.h"
+ #include "df.h"
+ #include "tm_p.h"
+@@ -61,6 +62,7 @@
+ #include "builtins.h"
+ #include "tm-constrs.h"
+ #include "rtl-iter.h"
++#include "gimplify.h"
+
+ /* This file should be included last. */
+ #include "target-def.h"
+@@ -104,7 +106,6 @@ static void arm_print_operand_address (FILE *, machine_mode, rtx);
static bool arm_print_operand_punct_valid_p (unsigned char code);
static const char *fp_const_from_val (REAL_VALUE_TYPE *);
static arm_cc get_arm_condition_code (rtx);
@@ -49905,7 +52457,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
static const char *output_multi_immediate (rtx *, const char *, const char *,
int, HOST_WIDE_INT);
static const char *shift_op (rtx, HOST_WIDE_INT *);
-@@ -249,8 +248,6 @@ static void arm_output_dwarf_dtprel (FILE *, int, rtx) ATTRIBUTE_UNUSED;
+@@ -135,6 +136,8 @@ static tree arm_handle_isr_attribute (tree *, tree, tree, int, bool *);
+ #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
+ static tree arm_handle_notshared_attribute (tree *, tree, tree, int, bool *);
+ #endif
++static tree arm_handle_cmse_nonsecure_entry (tree *, tree, tree, int, bool *);
++static tree arm_handle_cmse_nonsecure_call (tree *, tree, tree, int, bool *);
+ static void arm_output_function_epilogue (FILE *, HOST_WIDE_INT);
+ static void arm_output_function_prologue (FILE *, HOST_WIDE_INT);
+ static int arm_comp_type_attributes (const_tree, const_tree);
+@@ -249,8 +252,6 @@ static void arm_output_dwarf_dtprel (FILE *, int, rtx) ATTRIBUTE_UNUSED;
static bool arm_output_addr_const_extra (FILE *, rtx);
static bool arm_allocate_stack_slots_for_args (void);
static bool arm_warn_func_return (tree);
@@ -49914,7 +52475,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
static tree arm_promoted_type (const_tree t);
static tree arm_convert_to_type (tree type, tree expr);
static bool arm_scalar_mode_supported_p (machine_mode);
-@@ -300,6 +297,9 @@ static void arm_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
+@@ -300,6 +301,9 @@ static void arm_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
static unsigned HOST_WIDE_INT arm_asan_shadow_offset (void);
static void arm_sched_fusion_priority (rtx_insn *, int, int *, int*);
@@ -49924,7 +52485,19 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Table of machine attributes. */
static const struct attribute_spec arm_attribute_table[] =
-@@ -463,7 +463,7 @@ static const struct attribute_spec arm_attribute_table[] =
+@@ -343,6 +347,11 @@ static const struct attribute_spec arm_attribute_table[] =
+ { "notshared", 0, 0, false, true, false, arm_handle_notshared_attribute,
+ false },
+ #endif
++ /* ARMv8-M Security Extensions support. */
++ { "cmse_nonsecure_entry", 0, 0, true, false, false,
++ arm_handle_cmse_nonsecure_entry, false },
++ { "cmse_nonsecure_call", 0, 0, true, false, false,
++ arm_handle_cmse_nonsecure_call, true },
+ { NULL, 0, 0, false, false, false, NULL, false }
+ };
+
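
The two attribute_spec rows added above expose the CMSE attributes to user
code.  A usage sketch -- illustrative only, assuming a -mcmse build:

  /* A non-secure function-pointer type and a secure entry point that
     calls through it; the attribute on the typedef makes the compiler
     emit the non-secure call sequence.  */
  typedef void __attribute__ ((cmse_nonsecure_call)) ns_callback_t (int);

  void __attribute__ ((cmse_nonsecure_entry))
  secure_service (ns_callback_t *cb)
  {
    cb (42);
  }
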
+@@ -463,7 +472,7 @@ static const struct attribute_spec arm_attribute_table[] =
#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK arm_output_mi_thunk
#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
@@ -49933,7 +52506,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS arm_rtx_costs
-@@ -654,12 +654,6 @@ static const struct attribute_spec arm_attribute_table[] =
+@@ -654,12 +663,6 @@ static const struct attribute_spec arm_attribute_table[] =
#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS arm_preferred_reload_class
@@ -49946,7 +52519,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE arm_promoted_type
-@@ -820,6 +814,13 @@ int arm_arch8 = 0;
+@@ -820,6 +823,13 @@ int arm_arch8 = 0;
/* Nonzero if this chip supports the ARMv8.1 extensions. */
int arm_arch8_1 = 0;
@@ -49960,7 +52533,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Nonzero if this chip can benefit from load scheduling. */
int arm_ld_sched = 0;
-@@ -852,6 +853,9 @@ int arm_tune_cortex_a9 = 0;
+@@ -852,6 +862,9 @@ int arm_tune_cortex_a9 = 0;
interworking clean. */
int arm_cpp_interwork = 0;
@@ -49970,7 +52543,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Nonzero if chip supports Thumb 2. */
int arm_arch_thumb2;
-@@ -2055,6 +2059,29 @@ const struct tune_params arm_xgene1_tune =
+@@ -892,6 +905,9 @@ int arm_condexec_masklen = 0;
+ /* Nonzero if chip supports the ARMv8 CRC instructions. */
+ int arm_arch_crc = 0;
+
++/* Nonzero if chip supports the ARMv8-M security extensions. */
++int arm_arch_cmse = 0;
++
+ /* Nonzero if the core has a very small, high-latency, multiply unit. */
+ int arm_m_profile_small_mul = 0;
+
+@@ -2055,6 +2071,29 @@ const struct tune_params arm_xgene1_tune =
tune_params::SCHED_AUTOPREF_OFF
};
@@ -50000,7 +52583,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Branches can be dual-issued on Cortex-A5, so conditional execution is
less appealing. Set max_insns_skipped to a low value. */
-@@ -2127,6 +2154,29 @@ const struct tune_params arm_cortex_a12_tune =
+@@ -2127,6 +2166,29 @@ const struct tune_params arm_cortex_a12_tune =
tune_params::SCHED_AUTOPREF_OFF
};
@@ -50030,7 +52613,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* armv7m tuning. On Cortex-M4 cores for example, MOVW/MOVT take a single
cycle to execute each. An LDR from the constant pool also takes two cycles
to execute, but mildly increases pipelining opportunity (consecutive
-@@ -2264,9 +2314,11 @@ static const struct processors *arm_selected_arch;
+@@ -2183,7 +2245,8 @@ const struct tune_params arm_cortex_m7_tune =
+ };
+
+ /* The arm_v6m_tune is duplicated from arm_cortex_tune, rather than
+- arm_v6t2_tune. It is used for cortex-m0, cortex-m1 and cortex-m0plus. */
++ arm_v6t2_tune. It is used for cortex-m0, cortex-m1, cortex-m0plus and
++ cortex-m23. */
+ const struct tune_params arm_v6m_tune =
+ {
+ arm_9e_rtx_costs,
+@@ -2264,16 +2327,18 @@ static const struct processors *arm_selected_arch;
static const struct processors *arm_selected_cpu;
static const struct processors *arm_selected_tune;
@@ -50044,7 +52637,27 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Available values for -mfpu=. */
-@@ -2907,7 +2959,8 @@ arm_option_override_internal (struct gcc_options *opts,
+ const struct arm_fpu_desc all_fpus[] =
+ {
+-#define ARM_FPU(NAME, MODEL, REV, VFP_REGS, FEATURES) \
+- { NAME, MODEL, REV, VFP_REGS, FEATURES },
++#define ARM_FPU(NAME, REV, VFP_REGS, FEATURES) \
++ { NAME, REV, VFP_REGS, FEATURES },
+ #include "arm-fpus.def"
+ #undef ARM_FPU
+ };
+@@ -2752,8 +2817,8 @@ arm_option_check_internal (struct gcc_options *opts)
+ const struct arm_fpu_desc *fpu_desc = &all_fpus[opts->x_arm_fpu_index];
+
+ /* iWMMXt and NEON are incompatible. */
+- if (TARGET_IWMMXT && TARGET_VFP
+- && ARM_FPU_FSET_HAS (fpu_desc->features, FPU_FL_NEON))
++ if (TARGET_IWMMXT
++ && ARM_FPU_FSET_HAS (fpu_desc->features, FPU_FL_NEON))
+ error ("iWMMXt and NEON are incompatible");
+
+ /* Make sure that the processor choice does not conflict with any of the
+@@ -2907,7 +2972,8 @@ arm_option_override_internal (struct gcc_options *opts,
if (! opts_set->x_arm_restrict_it)
opts->x_arm_restrict_it = arm_arch8;
@@ -50054,7 +52667,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
opts->x_arm_restrict_it = 0;
/* Enable -munaligned-access by default for
-@@ -2918,7 +2971,8 @@ arm_option_override_internal (struct gcc_options *opts,
+@@ -2918,7 +2984,8 @@ arm_option_override_internal (struct gcc_options *opts,
Disable -munaligned-access by default for
- all pre-ARMv6 architecture-based processors
@@ -50064,7 +52677,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
if (! opts_set->x_unaligned_access)
{
-@@ -3170,6 +3224,8 @@ arm_option_override (void)
+@@ -3170,6 +3237,8 @@ arm_option_override (void)
arm_arch7em = ARM_FSET_HAS_CPU1 (insn_flags, FL_ARCH7EM);
arm_arch8 = ARM_FSET_HAS_CPU1 (insn_flags, FL_ARCH8);
arm_arch8_1 = ARM_FSET_HAS_CPU2 (insn_flags, FL2_ARCH8_1);
@@ -50073,9 +52686,11 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
arm_arch_thumb2 = ARM_FSET_HAS_CPU1 (insn_flags, FL_THUMB2);
arm_arch_xscale = ARM_FSET_HAS_CPU1 (insn_flags, FL_XSCALE);
-@@ -3185,6 +3241,13 @@ arm_option_override (void)
+@@ -3184,7 +3253,15 @@ arm_option_override (void)
+ arm_arch_no_volatile_ce = ARM_FSET_HAS_CPU1 (insn_flags, FL_NO_VOLATILE_CE);
arm_tune_cortex_a9 = (arm_tune == cortexa9) != 0;
arm_arch_crc = ARM_FSET_HAS_CPU1 (insn_flags, FL_CRC32);
++ arm_arch_cmse = ARM_FSET_HAS_CPU2 (insn_flags, FL2_CMSE);
arm_m_profile_small_mul = ARM_FSET_HAS_CPU1 (insn_flags, FL_SMALLMUL);
+ arm_fp16_inst = ARM_FSET_HAS_CPU2 (insn_flags, FL2_FP16INST);
+ if (arm_fp16_inst)
@@ -50087,7 +52702,37 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* V5 code we generate is completely interworking capable, so we turn off
TARGET_INTERWORK here to avoid many tests later on. */
-@@ -3298,6 +3361,20 @@ arm_option_override (void)
+@@ -3222,10 +3299,8 @@ arm_option_override (void)
+ /* If soft-float is specified then don't use FPU. */
+ if (TARGET_SOFT_FLOAT)
+ arm_fpu_attr = FPU_NONE;
+- else if (TARGET_VFP)
+- arm_fpu_attr = FPU_VFP;
+ else
+- gcc_unreachable();
++ arm_fpu_attr = FPU_VFP;
+
+ if (TARGET_AAPCS_BASED)
+ {
+@@ -3245,15 +3320,14 @@ arm_option_override (void)
+ if (arm_abi == ARM_ABI_IWMMXT)
+ arm_pcs_default = ARM_PCS_AAPCS_IWMMXT;
+ else if (arm_float_abi == ARM_FLOAT_ABI_HARD
+- && TARGET_HARD_FLOAT
+- && TARGET_VFP)
++ && TARGET_HARD_FLOAT)
+ arm_pcs_default = ARM_PCS_AAPCS_VFP;
+ else
+ arm_pcs_default = ARM_PCS_AAPCS;
+ }
+ else
+ {
+- if (arm_float_abi == ARM_FLOAT_ABI_HARD && TARGET_VFP)
++ if (arm_float_abi == ARM_FLOAT_ABI_HARD)
+ sorry ("-mfloat-abi=hard and VFP");
+
+ if (arm_abi == ARM_ABI_APCS)
+@@ -3298,6 +3372,20 @@ arm_option_override (void)
}
}
@@ -50108,7 +52753,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* If stack checking is disabled, we can use r10 as the PIC register,
which keeps r9 available. The EABI specifies r9 as the PIC register. */
if (flag_pic && TARGET_SINGLE_PIC_BASE)
-@@ -3329,10 +3406,6 @@ arm_option_override (void)
+@@ -3329,10 +3417,6 @@ arm_option_override (void)
arm_pic_register = pic_register;
}
@@ -50119,7 +52764,48 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Enable -mfix-cortex-m3-ldrd by default for Cortex-M3 cores. */
if (fix_cm3_ldrd == 2)
{
-@@ -3899,7 +3972,7 @@ const_ok_for_op (HOST_WIDE_INT i, enum rtx_code code)
+@@ -3436,6 +3520,9 @@ arm_option_override (void)
+ if (target_slow_flash_data)
+ arm_disable_literal_pool = true;
+
++ if (use_cmse && !arm_arch_cmse)
++ error ("target CPU does not support ARMv8-M Security Extensions");
++
+ /* Disable scheduling fusion by default if it's not armv7 processor
+ or doesn't prefer ldrd/strd. */
+ if (flag_schedule_fusion == 2
+@@ -3568,6 +3655,9 @@ arm_compute_func_type (void)
+ else
+ type |= arm_isr_value (TREE_VALUE (a));
+
++ if (lookup_attribute ("cmse_nonsecure_entry", attr))
++ type |= ARM_FT_CMSE_ENTRY;
++
+ return type;
+ }
+
+@@ -3794,6 +3884,11 @@ use_return_insn (int iscond, rtx sibling)
+ return 0;
+ }
+
++  /* ARMv8-M nonsecure entry functions need to use bxns to return and thus need
++ several instructions if anything needs to be popped. */
++ if (saved_int_regs && IS_CMSE_ENTRY (func_type))
++ return 0;
++
+ /* If there are saved registers but the LR isn't saved, then we need
+ two instructions for the return. */
+ if (saved_int_regs && !(saved_int_regs & (1 << LR_REGNUM)))
+@@ -3801,7 +3896,7 @@ use_return_insn (int iscond, rtx sibling)
+
+ /* Can't be done if any of the VFP regs are pushed,
+ since this also requires an insn. */
+- if (TARGET_HARD_FLOAT && TARGET_VFP)
++ if (TARGET_HARD_FLOAT)
+ for (regno = FIRST_VFP_REGNUM; regno <= LAST_VFP_REGNUM; regno++)
+ if (df_regs_ever_live_p (regno) && !call_used_regs[regno])
+ return 0;
+@@ -3899,7 +3994,7 @@ const_ok_for_op (HOST_WIDE_INT i, enum rtx_code code)
{
case SET:
/* See if we can use movw. */
@@ -50128,7 +52814,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return 1;
else
/* Otherwise, try mvn. */
-@@ -4118,7 +4191,7 @@ optimal_immediate_sequence (enum rtx_code code, unsigned HOST_WIDE_INT val,
+@@ -4118,7 +4213,7 @@ optimal_immediate_sequence (enum rtx_code code, unsigned HOST_WIDE_INT val,
yield a shorter sequence, we may as well use zero. */
insns1 = optimal_immediate_sequence_1 (code, val, return_sequence, best_start);
if (best_start != 0
@@ -50137,7 +52823,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
{
insns2 = optimal_immediate_sequence_1 (code, val, &tmp_sequence, 0);
if (insns2 <= insns1)
-@@ -4949,7 +5022,7 @@ arm_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
+@@ -4949,7 +5044,7 @@ arm_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
if (mode == VOIDmode)
mode = GET_MODE (*op1);
@@ -50146,7 +52832,24 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* For DImode, we have GE/LT/GEU/LTU comparisons. In ARM mode
we can also use cmp/cmpeq for GTU/LEU. GT/LE must be either
-@@ -5549,7 +5622,7 @@ aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
+@@ -5255,7 +5350,6 @@ arm_function_value_regno_p (const unsigned int regno)
+ if (regno == ARG_REGISTER (1)
+ || (TARGET_32BIT
+ && TARGET_AAPCS_BASED
+- && TARGET_VFP
+ && TARGET_HARD_FLOAT
+ && regno == FIRST_VFP_REGNUM)
+ || (TARGET_IWMMXT_ABI
+@@ -5274,7 +5368,7 @@ arm_apply_result_size (void)
+
+ if (TARGET_32BIT)
+ {
+- if (TARGET_HARD_FLOAT_ABI && TARGET_VFP)
++ if (TARGET_HARD_FLOAT_ABI)
+ size += 32;
+ if (TARGET_IWMMXT_ABI)
+ size += 8;
+@@ -5549,7 +5643,7 @@ aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
{
case REAL_TYPE:
mode = TYPE_MODE (type);
@@ -50155,7 +52858,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return -1;
if (*modep == VOIDmode)
-@@ -5797,11 +5870,16 @@ aapcs_vfp_is_call_candidate (CUMULATIVE_ARGS *pcum, machine_mode mode,
+@@ -5722,7 +5816,7 @@ use_vfp_abi (enum arm_pcs pcs_variant, bool is_double)
+ if (pcs_variant != ARM_PCS_AAPCS_LOCAL)
+ return false;
+
+- return (TARGET_32BIT && TARGET_VFP && TARGET_HARD_FLOAT &&
++ return (TARGET_32BIT && TARGET_HARD_FLOAT &&
+ (TARGET_VFP_DOUBLE || !is_double));
+ }
+
+@@ -5797,11 +5891,16 @@ aapcs_vfp_is_call_candidate (CUMULATIVE_ARGS *pcum, machine_mode mode,
&pcum->aapcs_vfp_rcount);
}
@@ -50173,7 +52885,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
unsigned mask = (1 << (shift * pcum->aapcs_vfp_rcount)) - 1;
int regno;
-@@ -5850,6 +5928,9 @@ aapcs_vfp_allocate (CUMULATIVE_ARGS *pcum, machine_mode mode,
+@@ -5850,6 +5949,9 @@ aapcs_vfp_allocate (CUMULATIVE_ARGS *pcum, machine_mode mode,
return false;
}
@@ -50183,7 +52895,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
static rtx
aapcs_vfp_allocate_return_reg (enum arm_pcs pcs_variant ATTRIBUTE_UNUSED,
machine_mode mode,
-@@ -5940,13 +6021,13 @@ static struct
+@@ -5940,13 +6042,13 @@ static struct
required for a return from FUNCTION_ARG. */
bool (*allocate) (CUMULATIVE_ARGS *, machine_mode, const_tree);
@@ -50202,7 +52914,275 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
rtx (*allocate_return_reg) (enum arm_pcs, machine_mode, const_tree);
/* Finish processing this argument and prepare to start processing
-@@ -8214,6 +8295,12 @@ arm_legitimate_constant_p_1 (machine_mode, rtx x)
+@@ -6561,6 +6663,185 @@ arm_handle_notshared_attribute (tree *node,
+ }
+ #endif
+
++/* This function returns true if a function with declaration FNDECL and type
++ FNTYPE uses the stack to pass arguments or return variables and false
++ otherwise. This is used for functions with the attributes
++ 'cmse_nonsecure_call' or 'cmse_nonsecure_entry' and this function will issue
++ diagnostic messages if the stack is used. NAME is the name of the attribute
++ used. */
++
++static bool
++cmse_func_args_or_return_in_stack (tree fndecl, tree name, tree fntype)
++{
++ function_args_iterator args_iter;
++ CUMULATIVE_ARGS args_so_far_v;
++ cumulative_args_t args_so_far;
++ bool first_param = true;
++ tree arg_type, prev_arg_type = NULL_TREE, ret_type;
++
++ /* Error out if any argument is passed on the stack. */
++ arm_init_cumulative_args (&args_so_far_v, fntype, NULL_RTX, fndecl);
++ args_so_far = pack_cumulative_args (&args_so_far_v);
++ FOREACH_FUNCTION_ARGS (fntype, arg_type, args_iter)
++ {
++ rtx arg_rtx;
++ machine_mode arg_mode = TYPE_MODE (arg_type);
++
++ prev_arg_type = arg_type;
++ if (VOID_TYPE_P (arg_type))
++ continue;
++
++ if (!first_param)
++ arm_function_arg_advance (args_so_far, arg_mode, arg_type, true);
++ arg_rtx = arm_function_arg (args_so_far, arg_mode, arg_type, true);
++ if (!arg_rtx
++ || arm_arg_partial_bytes (args_so_far, arg_mode, arg_type, true))
++ {
++ error ("%qE attribute not available to functions with arguments "
++ "passed on the stack", name);
++ return true;
++ }
++ first_param = false;
++ }
++
++ /* Error out for variadic functions since we cannot control how many
++     arguments will be passed and thus the stack could be used.  stdarg_p () is
++     not used for this check to avoid walking the arguments twice.  */
++ if (prev_arg_type != NULL_TREE && !VOID_TYPE_P (prev_arg_type))
++ {
++ error ("%qE attribute not available to functions with variable number "
++ "of arguments", name);
++ return true;
++ }
++
++ /* Error out if return value is passed on the stack. */
++ ret_type = TREE_TYPE (fntype);
++ if (arm_return_in_memory (ret_type, fntype))
++ {
++ error ("%qE attribute not available to functions that return value on "
++ "the stack", name);
++ return true;
++ }
++ return false;
++}
++
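
As a sketch of what this check enforces (assuming a straightforward AAPCS
target where only r0-r3 carry integer arguments; the names are made up for
the example):

    /* Accepted: four int arguments fit in r0-r3.  */
    int __attribute__ ((cmse_nonsecure_entry)) ok (int a, int b, int c, int d);

    /* Rejected with the "arguments passed on the stack" error: a fifth int
       argument no longer fits in registers.  */
    int __attribute__ ((cmse_nonsecure_entry)) bad (int a, int b, int c,
                                                    int d, int e);

Variadic functions and functions whose return value goes in memory are
refused for the same reason: anything that touches the stack could leak
secure state.
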
++/* Called upon detection of the use of the cmse_nonsecure_entry attribute, this
++ function will check whether the attribute is allowed here and will add the
++ attribute to the function declaration tree or otherwise issue a warning. */
++
++static tree
++arm_handle_cmse_nonsecure_entry (tree *node, tree name,
++ tree /* args */,
++ int /* flags */,
++ bool *no_add_attrs)
++{
++ tree fndecl;
++
++ if (!use_cmse)
++ {
++ *no_add_attrs = true;
++ warning (OPT_Wattributes, "%qE attribute ignored without -mcmse option.",
++ name);
++ return NULL_TREE;
++ }
++
++ /* Ignore attribute for function types. */
++ if (TREE_CODE (*node) != FUNCTION_DECL)
++ {
++ warning (OPT_Wattributes, "%qE attribute only applies to functions",
++ name);
++ *no_add_attrs = true;
++ return NULL_TREE;
++ }
++
++ fndecl = *node;
++
++ /* Warn for static linkage functions. */
++ if (!TREE_PUBLIC (fndecl))
++ {
++ warning (OPT_Wattributes, "%qE attribute has no effect on functions "
++ "with static linkage", name);
++ *no_add_attrs = true;
++ return NULL_TREE;
++ }
++
++ *no_add_attrs |= cmse_func_args_or_return_in_stack (fndecl, name,
++ TREE_TYPE (fndecl));
++ return NULL_TREE;
++}
++
++
++/* Called upon detection of the use of the cmse_nonsecure_call attribute, this
++ function will check whether the attribute is allowed here and will add the
++ attribute to the function type tree or otherwise issue a diagnostic. The
++ reason we check this at declaration time is to only allow the use of the
++ attribute with declarations of function pointers and not function
++ declarations. This function checks NODE is of the expected type and issues
++ diagnostics otherwise using NAME. If it is not of the expected type
++ *NO_ADD_ATTRS will be set to true. */
++
++static tree
++arm_handle_cmse_nonsecure_call (tree *node, tree name,
++ tree /* args */,
++ int /* flags */,
++ bool *no_add_attrs)
++{
++ tree decl = NULL_TREE, fntype = NULL_TREE;
++ tree type;
++
++ if (!use_cmse)
++ {
++ *no_add_attrs = true;
++ warning (OPT_Wattributes, "%qE attribute ignored without -mcmse option.",
++ name);
++ return NULL_TREE;
++ }
++
++ if (TREE_CODE (*node) == VAR_DECL || TREE_CODE (*node) == TYPE_DECL)
++ {
++ decl = *node;
++ fntype = TREE_TYPE (decl);
++ }
++
++ while (fntype != NULL_TREE && TREE_CODE (fntype) == POINTER_TYPE)
++ fntype = TREE_TYPE (fntype);
++
++ if (!decl || TREE_CODE (fntype) != FUNCTION_TYPE)
++ {
++ warning (OPT_Wattributes, "%qE attribute only applies to base type of a "
++ "function pointer", name);
++ *no_add_attrs = true;
++ return NULL_TREE;
++ }
++
++ *no_add_attrs |= cmse_func_args_or_return_in_stack (NULL, name, fntype);
++
++ if (*no_add_attrs)
++ return NULL_TREE;
++
++ /* Prevent trees being shared among function types with and without
++ cmse_nonsecure_call attribute. */
++ type = TREE_TYPE (decl);
++
++ type = build_distinct_type_copy (type);
++ TREE_TYPE (decl) = type;
++ fntype = type;
++
++ while (TREE_CODE (fntype) != FUNCTION_TYPE)
++ {
++ type = fntype;
++ fntype = TREE_TYPE (fntype);
++ fntype = build_distinct_type_copy (fntype);
++ TREE_TYPE (type) = fntype;
++ }
++
++ /* Construct a type attribute and add it to the function type. */
++ tree attrs = tree_cons (get_identifier ("cmse_nonsecure_call"), NULL_TREE,
++ TYPE_ATTRIBUTES (fntype));
++ TYPE_ATTRIBUTES (fntype) = attrs;
++ return NULL_TREE;
++}
++
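
For illustration, the attribute belongs on the base type of a function
pointer, not on a function declaration — a minimal sketch with a made-up
non-secure address, compiled with -mcmse:

    typedef void __attribute__ ((cmse_nonsecure_call)) ns_fn_t (void);

    void call_out (void)
    {
      ns_fn_t *fp = (ns_fn_t *) 0x00200001;  /* hypothetical NS code address */
      fp ();  /* expanded as a non-secure call */
    }

The build_distinct_type_copy dance above exists precisely so that tagging
ns_fn_t does not mutate a shared 'void (void)' type node used elsewhere.
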
+ /* Return 0 if the attributes for two types are incompatible, 1 if they
+ are compatible, and 2 if they are nearly compatible (which causes a
+ warning to be generated). */
+@@ -6601,6 +6882,14 @@ arm_comp_type_attributes (const_tree type1, const_tree type2)
+ if (l1 != l2)
+ return 0;
+
++ l1 = lookup_attribute ("cmse_nonsecure_call",
++ TYPE_ATTRIBUTES (type1)) != NULL;
++ l2 = lookup_attribute ("cmse_nonsecure_call",
++ TYPE_ATTRIBUTES (type2)) != NULL;
++
++ if (l1 != l2)
++ return 0;
++
+ return 1;
+ }
+
+@@ -6719,6 +7008,20 @@ arm_function_ok_for_sibcall (tree decl, tree exp)
+ if (IS_INTERRUPT (func_type))
+ return false;
+
++ /* ARMv8-M non-secure entry functions need to return with bxns which is only
++ generated for entry functions themselves. */
++ if (IS_CMSE_ENTRY (arm_current_func_type ()))
++ return false;
++
++ /* We do not allow ARMv8-M non-secure calls to be turned into sibling calls,
++     as this would complicate matters for later code generation.  */
++ if (TREE_CODE (exp) == CALL_EXPR)
++ {
++ tree fntype = TREE_TYPE (TREE_TYPE (CALL_EXPR_FN (exp)));
++ if (lookup_attribute ("cmse_nonsecure_call", TYPE_ATTRIBUTES (fntype)))
++ return false;
++ }
++
+ if (!VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
+ {
+ /* Check that the return value locations are the same. For
+@@ -7175,8 +7478,7 @@ arm_legitimate_address_outer_p (machine_mode mode, rtx x, RTX_CODE outer,
+ return 1;
+
+ use_ldrd = (TARGET_LDRD
+- && (mode == DImode
+- || (mode == DFmode && (TARGET_SOFT_FLOAT || TARGET_VFP))));
++ && (mode == DImode || mode == DFmode));
+
+ if (code == POST_INC || code == PRE_DEC
+ || ((code == PRE_INC || code == POST_DEC)
+@@ -7261,8 +7563,7 @@ thumb2_legitimate_address_p (machine_mode mode, rtx x, int strict_p)
+ return 1;
+
+ use_ldrd = (TARGET_LDRD
+- && (mode == DImode
+- || (mode == DFmode && (TARGET_SOFT_FLOAT || TARGET_VFP))));
++ && (mode == DImode || mode == DFmode));
+
+ if (code == POST_INC || code == PRE_DEC
+ || ((code == PRE_INC || code == POST_DEC)
+@@ -7355,7 +7656,6 @@ arm_legitimate_index_p (machine_mode mode, rtx index, RTX_CODE outer,
+
+ /* Standard coprocessor addressing modes. */
+ if (TARGET_HARD_FLOAT
+- && TARGET_VFP
+ && (mode == SFmode || mode == DFmode))
+ return (code == CONST_INT && INTVAL (index) < 1024
+ && INTVAL (index) > -1024
+@@ -7475,7 +7775,6 @@ thumb2_legitimate_index_p (machine_mode mode, rtx index, int strict_p)
+ /* ??? Combine arm and thumb2 coprocessor addressing modes. */
+ /* Standard coprocessor addressing modes. */
+ if (TARGET_HARD_FLOAT
+- && TARGET_VFP
+ && (mode == SFmode || mode == DFmode))
+ return (code == CONST_INT && INTVAL (index) < 1024
+ /* Thumb-2 allows only > -256 index range for it's core register
+@@ -8021,8 +8320,7 @@ arm_legitimize_address (rtx x, rtx orig_x, machine_mode mode)
+
+ /* VFP addressing modes actually allow greater offsets, but for
+ now we just stick with the lowest common denominator. */
+- if (mode == DImode
+- || ((TARGET_SOFT_FLOAT || TARGET_VFP) && mode == DFmode))
++ if (mode == DImode || mode == DFmode)
+ {
+ low_n = n & 0x0f;
+ n &= ~0x0f;
+@@ -8214,6 +8512,12 @@ arm_legitimate_constant_p_1 (machine_mode, rtx x)
static bool
thumb_legitimate_constant_p (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
{
@@ -50215,7 +53195,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return (CONST_INT_P (x)
|| CONST_DOUBLE_P (x)
|| CONSTANT_ADDRESS_P (x)
-@@ -8300,7 +8387,9 @@ thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
+@@ -8300,7 +8604,9 @@ thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
case CONST_INT:
if (outer == SET)
{
@@ -50226,7 +53206,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return 0;
if (thumb_shiftable_const (INTVAL (x)))
return COSTS_N_INSNS (2);
-@@ -8317,8 +8406,8 @@ thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
+@@ -8317,8 +8623,8 @@ thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
int i;
/* This duplicates the tests in the andsi3 expander. */
for (i = 9; i <= 31; i++)
@@ -50237,7 +53217,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return COSTS_N_INSNS (2);
}
else if (outer == ASHIFT || outer == ASHIFTRT
-@@ -9003,7 +9092,7 @@ static inline int
+@@ -9003,7 +9309,7 @@ static inline int
thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
{
machine_mode mode = GET_MODE (x);
@@ -50246,7 +53226,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
switch (code)
{
-@@ -9049,17 +9138,27 @@ thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
+@@ -9049,17 +9355,27 @@ thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
/* A SET doesn't have a mode, so let's look at the SET_DEST to get
the mode. */
words = ARM_NUM_INTS (GET_MODE_SIZE (GET_MODE (SET_DEST (x))));
@@ -50280,7 +53260,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* See split "TARGET_THUMB1 && satisfies_constraint_J". */
if (INTVAL (x) >= -255 && INTVAL (x) <= -1)
return COSTS_N_INSNS (2);
-@@ -9079,8 +9178,8 @@ thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
+@@ -9079,8 +9395,8 @@ thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
int i;
/* This duplicates the tests in the andsi3 expander. */
for (i = 9; i <= 31; i++)
@@ -50291,7 +53271,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return COSTS_N_INSNS (2);
}
else if (outer == ASHIFT || outer == ASHIFTRT
-@@ -10759,8 +10858,6 @@ arm_new_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
+@@ -10759,8 +11075,6 @@ arm_new_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
if ((arm_arch4 || GET_MODE (XEXP (x, 0)) == SImode)
&& MEM_P (XEXP (x, 0)))
{
@@ -50300,7 +53280,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
if (mode == DImode)
*cost += COSTS_N_INSNS (1);
-@@ -12257,7 +12354,7 @@ vfp3_const_double_index (rtx x)
+@@ -12257,7 +12571,7 @@ vfp3_const_double_index (rtx x)
/* We can permit four significant bits of mantissa only, plus a high bit
which is always 1. */
@@ -50309,7 +53289,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
if ((mantissa & mask) != 0)
return -1;
-@@ -13139,7 +13236,7 @@ coproc_secondary_reload_class (machine_mode mode, rtx x, bool wb)
+@@ -13139,7 +13453,7 @@ coproc_secondary_reload_class (machine_mode mode, rtx x, bool wb)
{
if (mode == HFmode)
{
@@ -50318,7 +53298,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return GENERAL_REGS;
if (s_register_operand (x, mode) || neon_vector_mem_operand (x, 2, true))
return NO_REGS;
-@@ -15976,14 +16073,17 @@ gen_operands_ldrd_strd (rtx *operands, bool load,
+@@ -15976,14 +16290,17 @@ gen_operands_ldrd_strd (rtx *operands, bool load,
/* If the same input register is used in both stores
when storing different constants, try to find a free register.
For example, the code
@@ -50343,7 +53323,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
if (const_store
&& REGNO (operands[0]) == REGNO (operands[1])
&& INTVAL (operands[4]) != INTVAL (operands[5]))
-@@ -16002,7 +16102,6 @@ gen_operands_ldrd_strd (rtx *operands, bool load,
+@@ -16002,7 +16319,6 @@ gen_operands_ldrd_strd (rtx *operands, bool load,
}
else if (TARGET_ARM)
{
@@ -50351,7 +53331,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
int regno = REGNO (operands[0]);
if (!peep2_reg_dead_p (4, operands[0]))
{
-@@ -16356,7 +16455,7 @@ get_jump_table_size (rtx_jump_table_data *insn)
+@@ -16356,7 +16672,7 @@ get_jump_table_size (rtx_jump_table_data *insn)
{
case 1:
/* Round up size of TBB table to a halfword boundary. */
@@ -50360,7 +53340,672 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
break;
case 2:
/* No padding necessary for TBH. */
-@@ -18588,6 +18687,8 @@ output_move_vfp (rtx *operands)
+@@ -17257,97 +17573,561 @@ note_invalid_constants (rtx_insn *insn, HOST_WIDE_INT address, int do_pushes)
+ return;
+ }
+
+-/* Rewrite move insn into subtract of 0 if the condition codes will
+- be useful in next conditional jump insn. */
++/* This function computes the clear mask and PADDING_BITS_TO_CLEAR for structs
++ and unions in the context of ARMv8-M Security Extensions. It is used as a
++ helper function for both 'cmse_nonsecure_call' and 'cmse_nonsecure_entry'
++   functions.  The PADDING_BITS_TO_CLEAR pointer can point to either one or
++   four masks, depending on whether it is being computed for a
++ 'cmse_nonsecure_entry' return value or a 'cmse_nonsecure_call' argument
++ respectively. The tree for the type of the argument or a field within an
++ argument is passed in ARG_TYPE, the current register this argument or field
++ starts in is kept in the pointer REGNO and updated accordingly, the bit this
++ argument or field starts at is passed in STARTING_BIT and the last used bit
++ is kept in LAST_USED_BIT which is also updated accordingly. */
++
++static unsigned HOST_WIDE_INT
++comp_not_to_clear_mask_str_un (tree arg_type, int * regno,
++ uint32_t * padding_bits_to_clear,
++ unsigned starting_bit, int * last_used_bit)
+
+-static void
+-thumb1_reorg (void)
+ {
+- basic_block bb;
++ unsigned HOST_WIDE_INT not_to_clear_reg_mask = 0;
+
+- FOR_EACH_BB_FN (bb, cfun)
++ if (TREE_CODE (arg_type) == RECORD_TYPE)
+ {
+- rtx dest, src;
+- rtx cmp, op0, op1, set = NULL;
+- rtx_insn *prev, *insn = BB_END (bb);
+- bool insn_clobbered = false;
++ unsigned current_bit = starting_bit;
++ tree field;
++ long int offset, size;
+
+- while (insn != BB_HEAD (bb) && !NONDEBUG_INSN_P (insn))
+- insn = PREV_INSN (insn);
+
+- /* Find the last cbranchsi4_insn in basic block BB. */
+- if (insn == BB_HEAD (bb)
+- || INSN_CODE (insn) != CODE_FOR_cbranchsi4_insn)
+- continue;
++ field = TYPE_FIELDS (arg_type);
++ while (field)
++ {
++ /* The offset within a structure is always an offset from
++	     the start of that structure.  Make sure we take that into account in
++	     the calculation of the register-based offset that we use here.  */
++ offset = starting_bit;
++ offset += TREE_INT_CST_ELT (DECL_FIELD_BIT_OFFSET (field), 0);
++ offset %= 32;
+
+- /* Get the register with which we are comparing. */
+- cmp = XEXP (SET_SRC (PATTERN (insn)), 0);
+- op0 = XEXP (cmp, 0);
+- op1 = XEXP (cmp, 1);
++	  /* This is the actual size of the field; for bitfields this is the
++ bitfield width and not the container size. */
++ size = TREE_INT_CST_ELT (DECL_SIZE (field), 0);
+
+- /* Check that comparison is against ZERO. */
+- if (!CONST_INT_P (op1) || INTVAL (op1) != 0)
+- continue;
++ if (*last_used_bit != offset)
++ {
++ if (offset < *last_used_bit)
++ {
++ /* This field's offset is before the 'last_used_bit', that
++ means this field goes on the next register. So we need to
++ pad the rest of the current register and increase the
++ register number. */
++ uint32_t mask;
++ mask = ((uint32_t)-1) - ((uint32_t) 1 << *last_used_bit);
++ mask++;
++
++ padding_bits_to_clear[*regno] |= mask;
++ not_to_clear_reg_mask |= HOST_WIDE_INT_1U << *regno;
++ (*regno)++;
++ }
++ else
++ {
++ /* Otherwise we pad the bits between the last field's end and
++ the start of the new field. */
++ uint32_t mask;
+
+- /* Find the first flag setting insn before INSN in basic block BB. */
+- gcc_assert (insn != BB_HEAD (bb));
+- for (prev = PREV_INSN (insn);
+- (!insn_clobbered
+- && prev != BB_HEAD (bb)
+- && (NOTE_P (prev)
+- || DEBUG_INSN_P (prev)
+- || ((set = single_set (prev)) != NULL
+- && get_attr_conds (prev) == CONDS_NOCOND)));
+- prev = PREV_INSN (prev))
+- {
+- if (reg_set_p (op0, prev))
+- insn_clobbered = true;
+- }
++ mask = ((uint32_t)-1) >> (32 - offset);
++ mask -= ((uint32_t) 1 << *last_used_bit) - 1;
++ padding_bits_to_clear[*regno] |= mask;
++ }
++ current_bit = offset;
++ }
+
+- /* Skip if op0 is clobbered by insn other than prev. */
+- if (insn_clobbered)
+- continue;
++ /* Calculate further padding bits for inner structs/unions too. */
++ if (RECORD_OR_UNION_TYPE_P (TREE_TYPE (field)))
++ {
++ *last_used_bit = current_bit;
++ not_to_clear_reg_mask
++ |= comp_not_to_clear_mask_str_un (TREE_TYPE (field), regno,
++ padding_bits_to_clear, offset,
++ last_used_bit);
++ }
++ else
++ {
++ /* Update 'current_bit' with this field's size. If the
++ 'current_bit' lies in a subsequent register, update 'regno' and
++ reset 'current_bit' to point to the current bit in that new
++ register. */
++ current_bit += size;
++ while (current_bit >= 32)
++ {
++ current_bit-=32;
++ not_to_clear_reg_mask |= HOST_WIDE_INT_1U << *regno;
++ (*regno)++;
++ }
++ *last_used_bit = current_bit;
++ }
+
+- if (!set)
+- continue;
++ field = TREE_CHAIN (field);
++ }
++ not_to_clear_reg_mask |= HOST_WIDE_INT_1U << *regno;
++ }
++ else if (TREE_CODE (arg_type) == UNION_TYPE)
++ {
++ tree field, field_t;
++ int i, regno_t, field_size;
++ int max_reg = -1;
++ int max_bit = -1;
++ uint32_t mask;
++ uint32_t padding_bits_to_clear_res[NUM_ARG_REGS]
++ = {-1, -1, -1, -1};
++
++ /* To compute the padding bits in a union we only consider bits as
++ padding bits if they are always either a padding bit or fall outside a
++      field's size for all fields in the union.  */
++ field = TYPE_FIELDS (arg_type);
++ while (field)
++ {
++ uint32_t padding_bits_to_clear_t[NUM_ARG_REGS]
++ = {0U, 0U, 0U, 0U};
++ int last_used_bit_t = *last_used_bit;
++ regno_t = *regno;
++ field_t = TREE_TYPE (field);
++
++ /* If the field's type is either a record or a union make sure to
++ compute their padding bits too. */
++ if (RECORD_OR_UNION_TYPE_P (field_t))
++ not_to_clear_reg_mask
++	      |= comp_not_to_clear_mask_str_un (field_t, &regno_t,
++ &padding_bits_to_clear_t[0],
++ starting_bit, &last_used_bit_t);
++ else
++ {
++ field_size = TREE_INT_CST_ELT (DECL_SIZE (field), 0);
++ regno_t = (field_size / 32) + *regno;
++ last_used_bit_t = (starting_bit + field_size) % 32;
++ }
+
+- dest = SET_DEST (set);
+- src = SET_SRC (set);
+- if (!low_register_operand (dest, SImode)
+- || !low_register_operand (src, SImode))
+- continue;
++ for (i = *regno; i < regno_t; i++)
++ {
++ /* For all but the last register used by this field only keep the
++ padding bits that were padding bits in this field. */
++ padding_bits_to_clear_res[i] &= padding_bits_to_clear_t[i];
++ }
+
+- /* Rewrite move into subtract of 0 if its operand is compared with ZERO
+- in INSN. Both src and dest of the move insn are checked. */
+- if (REGNO (op0) == REGNO (src) || REGNO (op0) == REGNO (dest))
+- {
+- dest = copy_rtx (dest);
+- src = copy_rtx (src);
+- src = gen_rtx_MINUS (SImode, src, const0_rtx);
+- PATTERN (prev) = gen_rtx_SET (dest, src);
+- INSN_CODE (prev) = -1;
+- /* Set test register in INSN to dest. */
+- XEXP (cmp, 0) = copy_rtx (dest);
+- INSN_CODE (insn) = -1;
++ /* For the last register, keep all padding bits that were padding
++ bits in this field and any padding bits that are still valid
++ as padding bits but fall outside of this field's size. */
++ mask = (((uint32_t) -1) - ((uint32_t) 1 << last_used_bit_t)) + 1;
++ padding_bits_to_clear_res[regno_t]
++ &= padding_bits_to_clear_t[regno_t] | mask;
++
++ /* Update the maximum size of the fields in terms of registers used
++ ('max_reg') and the 'last_used_bit' in said register. */
++ if (max_reg < regno_t)
++ {
++ max_reg = regno_t;
++ max_bit = last_used_bit_t;
++ }
++ else if (max_reg == regno_t && max_bit < last_used_bit_t)
++ max_bit = last_used_bit_t;
++
++ field = TREE_CHAIN (field);
+ }
++
++ /* Update the current padding_bits_to_clear using the intersection of the
++ padding bits of all the fields. */
++ for (i=*regno; i < max_reg; i++)
++ padding_bits_to_clear[i] |= padding_bits_to_clear_res[i];
++
++ /* Do not keep trailing padding bits, we do not know yet whether this
++ is the end of the argument. */
++ mask = ((uint32_t) 1 << max_bit) - 1;
++ padding_bits_to_clear[max_reg]
++ |= padding_bits_to_clear_res[max_reg] & mask;
++
++ *regno = max_reg;
++ *last_used_bit = max_bit;
+ }
++ else
++ /* This function should only be used for structs and unions. */
++ gcc_unreachable ();
++
++ return not_to_clear_reg_mask;
+ }
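
A worked example, not taken from the patch, may make the masks concrete: for

    struct s { unsigned char a; unsigned short b; };

passed or returned in r0, field 'a' occupies bits 0-7, the alignment of 'b'
leaves bits 8-15 as padding, and 'b' occupies bits 16-31.  The walk above
records 0x0000ff00 in padding_bits_to_clear[0] and sets bit 0 of the returned
mask, telling the caller that r0 must be preserved but its bits 8-15 scrubbed.
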
+
+-/* Convert instructions to their cc-clobbering variant if possible, since
+- that allows us to use smaller encodings. */
++/* In the context of ARMv8-M Security Extensions, this function is used for both
++ 'cmse_nonsecure_call' and 'cmse_nonsecure_entry' functions to compute what
++ registers are used when returning or passing arguments, which is then
++ returned as a mask. It will also compute a mask to indicate padding/unused
++ bits for each of these registers, and passes this through the
++ PADDING_BITS_TO_CLEAR pointer. The tree of the argument type is passed in
++ ARG_TYPE, the rtl representation of the argument is passed in ARG_RTX and
++ the starting register used to pass this argument or return value is passed
++ in REGNO. It makes use of 'comp_not_to_clear_mask_str_un' to compute these
++ for struct and union types. */
++
++static unsigned HOST_WIDE_INT
++compute_not_to_clear_mask (tree arg_type, rtx arg_rtx, int regno,
++ uint32_t * padding_bits_to_clear)
+
+-static void
+-thumb2_reorg (void)
+ {
+- basic_block bb;
+- regset_head live;
++ int last_used_bit = 0;
++ unsigned HOST_WIDE_INT not_to_clear_mask;
+
+- INIT_REG_SET (&live);
++ if (RECORD_OR_UNION_TYPE_P (arg_type))
++ {
++ not_to_clear_mask
++	= comp_not_to_clear_mask_str_un (arg_type, &regno,
++ padding_bits_to_clear, 0,
++ &last_used_bit);
+
+- /* We are freeing block_for_insn in the toplev to keep compatibility
+- with old MDEP_REORGS that are not CFG based. Recompute it now. */
+- compute_bb_for_insn ();
+- df_analyze ();
++
++ /* If the 'last_used_bit' is not zero, that means we are still using a
++ part of the last 'regno'. In such cases we must clear the trailing
++ bits. Otherwise we are not using regno and we should mark it as to
++ clear. */
++ if (last_used_bit != 0)
++ padding_bits_to_clear[regno]
++ |= ((uint32_t)-1) - ((uint32_t) 1 << last_used_bit) + 1;
++ else
++ not_to_clear_mask &= ~(HOST_WIDE_INT_1U << regno);
++ }
++ else
++ {
++ not_to_clear_mask = 0;
++      /* We are not dealing with structs or unions, so these arguments may be
++ passed in floating point registers too. In some cases a BLKmode is
++ used when returning or passing arguments in multiple VFP registers. */
++ if (GET_MODE (arg_rtx) == BLKmode)
++ {
++ int i, arg_regs;
++ rtx reg;
++
++ /* This should really only occur when dealing with the hard-float
++ ABI. */
++ gcc_assert (TARGET_HARD_FLOAT_ABI);
++
++ for (i = 0; i < XVECLEN (arg_rtx, 0); i++)
++ {
++ reg = XEXP (XVECEXP (arg_rtx, 0, i), 0);
++ gcc_assert (REG_P (reg));
++
++ not_to_clear_mask |= HOST_WIDE_INT_1U << REGNO (reg);
++
++ /* If we are dealing with DF mode, make sure we don't
++ clear either of the registers it addresses. */
++ arg_regs = ARM_NUM_REGS (GET_MODE (reg));
++ if (arg_regs > 1)
++ {
++ unsigned HOST_WIDE_INT mask;
++ mask = HOST_WIDE_INT_1U << (REGNO (reg) + arg_regs);
++ mask -= HOST_WIDE_INT_1U << REGNO (reg);
++ not_to_clear_mask |= mask;
++ }
++ }
++ }
++ else
++ {
++ /* Otherwise we can rely on the MODE to determine how many registers
++ are being used by this argument. */
++ int arg_regs = ARM_NUM_REGS (GET_MODE (arg_rtx));
++ not_to_clear_mask |= HOST_WIDE_INT_1U << REGNO (arg_rtx);
++ if (arg_regs > 1)
++ {
++ unsigned HOST_WIDE_INT
++ mask = HOST_WIDE_INT_1U << (REGNO (arg_rtx) + arg_regs);
++ mask -= HOST_WIDE_INT_1U << REGNO (arg_rtx);
++ not_to_clear_mask |= mask;
++ }
++ }
++ }
++
++ return not_to_clear_mask;
++}
++
++/* Saves callee-saved registers, clears the callee-saved and caller-saved
++   registers that are not used to pass arguments before a cmse_nonsecure_call,
++   and restores the callee-saved registers afterwards.  */
++
++static void
++cmse_nonsecure_call_clear_caller_saved (void)
++{
++ basic_block bb;
++
++ FOR_EACH_BB_FN (bb, cfun)
++ {
++ rtx_insn *insn;
++
++ FOR_BB_INSNS (bb, insn)
++ {
++ uint64_t to_clear_mask, float_mask;
++ rtx_insn *seq;
++ rtx pat, call, unspec, reg, cleared_reg, tmp;
++ unsigned int regno, maxregno;
++ rtx address;
++ CUMULATIVE_ARGS args_so_far_v;
++ cumulative_args_t args_so_far;
++ tree arg_type, fntype;
++ bool using_r4, first_param = true;
++ function_args_iterator args_iter;
++ uint32_t padding_bits_to_clear[4] = {0U, 0U, 0U, 0U};
++ uint32_t * padding_bits_to_clear_ptr = &padding_bits_to_clear[0];
++
++ if (!NONDEBUG_INSN_P (insn))
++ continue;
++
++ if (!CALL_P (insn))
++ continue;
++
++ pat = PATTERN (insn);
++ gcc_assert (GET_CODE (pat) == PARALLEL && XVECLEN (pat, 0) > 0);
++ call = XVECEXP (pat, 0, 0);
++
++	  /* Get the real call RTX if the insn sets a value, i.e. returns.  */
++ if (GET_CODE (call) == SET)
++ call = SET_SRC (call);
++
++ /* Check if it is a cmse_nonsecure_call. */
++ unspec = XEXP (call, 0);
++ if (GET_CODE (unspec) != UNSPEC
++ || XINT (unspec, 1) != UNSPEC_NONSECURE_MEM)
++ continue;
++
++ /* Determine the caller-saved registers we need to clear. */
++ to_clear_mask = (1LL << (NUM_ARG_REGS)) - 1;
++ maxregno = NUM_ARG_REGS - 1;
++ /* Only look at the caller-saved floating point registers in case of
++ -mfloat-abi=hard. For -mfloat-abi=softfp we will be using the
++ lazy store and loads which clear both caller- and callee-saved
++ registers. */
++ if (TARGET_HARD_FLOAT_ABI)
++ {
++ float_mask = (1LL << (D7_VFP_REGNUM + 1)) - 1;
++ float_mask &= ~((1LL << FIRST_VFP_REGNUM) - 1);
++ to_clear_mask |= float_mask;
++ maxregno = D7_VFP_REGNUM;
++ }
++
++ /* Make sure the register used to hold the function address is not
++ cleared. */
++ address = RTVEC_ELT (XVEC (unspec, 0), 0);
++ gcc_assert (MEM_P (address));
++ gcc_assert (REG_P (XEXP (address, 0)));
++ to_clear_mask &= ~(1LL << REGNO (XEXP (address, 0)));
++
++ /* Set basic block of call insn so that df rescan is performed on
++ insns inserted here. */
++ set_block_for_insn (insn, bb);
++ df_set_flags (DF_DEFER_INSN_RESCAN);
++ start_sequence ();
++
++ /* Make sure the scheduler doesn't schedule other insns beyond
++ here. */
++ emit_insn (gen_blockage ());
++
++	  /* Walk through all arguments and clear registers appropriately.  */
++ fntype = TREE_TYPE (MEM_EXPR (address));
++ arm_init_cumulative_args (&args_so_far_v, fntype, NULL_RTX,
++ NULL_TREE);
++ args_so_far = pack_cumulative_args (&args_so_far_v);
++ FOREACH_FUNCTION_ARGS (fntype, arg_type, args_iter)
++ {
++ rtx arg_rtx;
++ machine_mode arg_mode = TYPE_MODE (arg_type);
++
++ if (VOID_TYPE_P (arg_type))
++ continue;
++
++ if (!first_param)
++ arm_function_arg_advance (args_so_far, arg_mode, arg_type,
++ true);
++
++ arg_rtx = arm_function_arg (args_so_far, arg_mode, arg_type,
++ true);
++ gcc_assert (REG_P (arg_rtx));
++ to_clear_mask
++ &= ~compute_not_to_clear_mask (arg_type, arg_rtx,
++ REGNO (arg_rtx),
++ padding_bits_to_clear_ptr);
++
++ first_param = false;
++ }
++
++ /* Clear padding bits where needed. */
++ cleared_reg = XEXP (address, 0);
++ reg = gen_rtx_REG (SImode, IP_REGNUM);
++ using_r4 = false;
++ for (regno = R0_REGNUM; regno < NUM_ARG_REGS; regno++)
++ {
++ if (padding_bits_to_clear[regno] == 0)
++ continue;
++
++ /* If this is a Thumb-1 target copy the address of the function
++ we are calling from 'r4' into 'ip' such that we can use r4 to
++ clear the unused bits in the arguments. */
++ if (TARGET_THUMB1 && !using_r4)
++ {
++ using_r4 = true;
++ reg = cleared_reg;
++ emit_move_insn (gen_rtx_REG (SImode, IP_REGNUM),
++ reg);
++ }
++
++ tmp = GEN_INT ((((~padding_bits_to_clear[regno]) << 16u) >> 16u));
++ emit_move_insn (reg, tmp);
++ /* Also fill the top half of the negated
++ padding_bits_to_clear. */
++ if (((~padding_bits_to_clear[regno]) >> 16) > 0)
++ {
++ tmp = GEN_INT ((~padding_bits_to_clear[regno]) >> 16);
++ emit_insn (gen_rtx_SET (gen_rtx_ZERO_EXTRACT (SImode, reg,
++ GEN_INT (16),
++ GEN_INT (16)),
++ tmp));
++ }
++
++ emit_insn (gen_andsi3 (gen_rtx_REG (SImode, regno),
++ gen_rtx_REG (SImode, regno),
++ reg));
++
++ }
++ if (using_r4)
++ emit_move_insn (cleared_reg,
++ gen_rtx_REG (SImode, IP_REGNUM));
++
++ /* We use right shift and left shift to clear the LSB of the address
++ we jump to instead of using bic, to avoid having to use an extra
++ register on Thumb-1. */
++ tmp = gen_rtx_LSHIFTRT (SImode, cleared_reg, const1_rtx);
++ emit_insn (gen_rtx_SET (cleared_reg, tmp));
++ tmp = gen_rtx_ASHIFT (SImode, cleared_reg, const1_rtx);
++ emit_insn (gen_rtx_SET (cleared_reg, tmp));
++
++	  /* Clear all registers that could leak information before doing a non-secure
++ call. */
++ for (regno = R0_REGNUM; regno <= maxregno; regno++)
++ {
++ if (!(to_clear_mask & (1LL << regno)))
++ continue;
++
++ /* If regno is an even vfp register and its successor is also to
++ be cleared, use vmov. */
++ if (IS_VFP_REGNUM (regno))
++ {
++ if (TARGET_VFP_DOUBLE
++ && VFP_REGNO_OK_FOR_DOUBLE (regno)
++ && to_clear_mask & (1LL << (regno + 1)))
++ emit_move_insn (gen_rtx_REG (DFmode, regno++),
++ CONST0_RTX (DFmode));
++ else
++ emit_move_insn (gen_rtx_REG (SFmode, regno),
++ CONST0_RTX (SFmode));
++ }
++ else
++ emit_move_insn (gen_rtx_REG (SImode, regno), cleared_reg);
++ }
++
++ seq = get_insns ();
++ end_sequence ();
++ emit_insn_before (seq, insn);
++
++ }
++ }
++}
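
Schematically, for a call through a pointer held in r4 with a single int
argument in r0 on a soft-float Thumb-2 target, the inserted sequence amounts
to something like this (a sketch, not verbatim compiler output):

    lsrs  r4, r4, #1   @ strip the Thumb bit from the target address
    lsls  r4, r4, #1
    movs  r1, r4       @ r4 holds no secret, so reuse it to wipe r1-r3
    movs  r2, r4
    movs  r3, r4
    bl    __gnu_cmse_nonsecure_call   @ libgcc helper, target address in r4

r0 survives because it carries the argument; with -mfloat-abi=hard the unused
d0-d7 argument registers are cleared as well.
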
++
++/* Rewrite move insn into subtract of 0 if the condition codes will
++ be useful in next conditional jump insn. */
++
++static void
++thumb1_reorg (void)
++{
++ basic_block bb;
++
++ FOR_EACH_BB_FN (bb, cfun)
++ {
++ rtx dest, src;
++ rtx cmp, op0, op1, set = NULL;
++ rtx_insn *prev, *insn = BB_END (bb);
++ bool insn_clobbered = false;
++
++ while (insn != BB_HEAD (bb) && !NONDEBUG_INSN_P (insn))
++ insn = PREV_INSN (insn);
++
++ /* Find the last cbranchsi4_insn in basic block BB. */
++ if (insn == BB_HEAD (bb)
++ || INSN_CODE (insn) != CODE_FOR_cbranchsi4_insn)
++ continue;
++
++ /* Get the register with which we are comparing. */
++ cmp = XEXP (SET_SRC (PATTERN (insn)), 0);
++ op0 = XEXP (cmp, 0);
++ op1 = XEXP (cmp, 1);
++
++ /* Check that comparison is against ZERO. */
++ if (!CONST_INT_P (op1) || INTVAL (op1) != 0)
++ continue;
++
++ /* Find the first flag setting insn before INSN in basic block BB. */
++ gcc_assert (insn != BB_HEAD (bb));
++ for (prev = PREV_INSN (insn);
++ (!insn_clobbered
++ && prev != BB_HEAD (bb)
++ && (NOTE_P (prev)
++ || DEBUG_INSN_P (prev)
++ || ((set = single_set (prev)) != NULL
++ && get_attr_conds (prev) == CONDS_NOCOND)));
++ prev = PREV_INSN (prev))
++ {
++ if (reg_set_p (op0, prev))
++ insn_clobbered = true;
++ }
++
++ /* Skip if op0 is clobbered by insn other than prev. */
++ if (insn_clobbered)
++ continue;
++
++ if (!set)
++ continue;
++
++ dest = SET_DEST (set);
++ src = SET_SRC (set);
++ if (!low_register_operand (dest, SImode)
++ || !low_register_operand (src, SImode))
++ continue;
++
++ /* Rewrite move into subtract of 0 if its operand is compared with ZERO
++ in INSN. Both src and dest of the move insn are checked. */
++ if (REGNO (op0) == REGNO (src) || REGNO (op0) == REGNO (dest))
++ {
++ dest = copy_rtx (dest);
++ src = copy_rtx (src);
++ src = gen_rtx_MINUS (SImode, src, const0_rtx);
++ PATTERN (prev) = gen_rtx_SET (dest, src);
++ INSN_CODE (prev) = -1;
++ /* Set test register in INSN to dest. */
++ XEXP (cmp, 0) = copy_rtx (dest);
++ INSN_CODE (insn) = -1;
++ }
++ }
++}
++
++/* Convert instructions to their cc-clobbering variant if possible, since
++ that allows us to use smaller encodings. */
++
++static void
++thumb2_reorg (void)
++{
++ basic_block bb;
++ regset_head live;
++
++ INIT_REG_SET (&live);
++
++ /* We are freeing block_for_insn in the toplev to keep compatibility
++ with old MDEP_REORGS that are not CFG based. Recompute it now. */
++ compute_bb_for_insn ();
++ df_analyze ();
+
+ enum Convert_Action {SKIP, CONV, SWAP_CONV};
+
+@@ -17557,6 +18337,8 @@ arm_reorg (void)
+ HOST_WIDE_INT address = 0;
+ Mfix * fix;
+
++ if (use_cmse)
++ cmse_nonsecure_call_clear_caller_saved ();
+ if (TARGET_THUMB1)
+ thumb1_reorg ();
+ else if (TARGET_THUMB2)
+@@ -17929,6 +18711,23 @@ vfp_emit_fstmd (int base_reg, int count)
+ return count * 8;
+ }
+
++/* Returns true if -mcmse has been passed and the function pointed to by 'addr'
++   has the cmse_nonsecure_call attribute, and false otherwise.  */
++
++bool
++detect_cmse_nonsecure_call (tree addr)
++{
++ if (!addr)
++ return FALSE;
++
++ tree fntype = TREE_TYPE (addr);
++ if (use_cmse && lookup_attribute ("cmse_nonsecure_call",
++ TYPE_ATTRIBUTES (fntype)))
++ return TRUE;
++ return FALSE;
++}
++
++
+ /* Emit a call instruction with pattern PAT. ADDR is the address of
+ the call target. */
+
+@@ -18588,6 +19387,8 @@ output_move_vfp (rtx *operands)
rtx reg, mem, addr, ops[2];
int load = REG_P (operands[0]);
int dp = GET_MODE_SIZE (GET_MODE (operands[0])) == 8;
@@ -50369,19 +54014,19 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
int integer_p = GET_MODE_CLASS (GET_MODE (operands[0])) == MODE_INT;
const char *templ;
char buff[50];
-@@ -18600,8 +18701,10 @@ output_move_vfp (rtx *operands)
+@@ -18600,8 +19401,10 @@ output_move_vfp (rtx *operands)
gcc_assert (REG_P (reg));
gcc_assert (IS_VFP_REGNUM (REGNO (reg)));
- gcc_assert (mode == SFmode
-+ gcc_assert ((mode == HFmode && TARGET_HARD_FLOAT && TARGET_VFP)
++ gcc_assert ((mode == HFmode && TARGET_HARD_FLOAT)
+ || mode == SFmode
|| mode == DFmode
+ || mode == HImode
|| mode == SImode
|| mode == DImode
|| (TARGET_NEON && VALID_NEON_DREG_MODE (mode)));
-@@ -18632,7 +18735,7 @@ output_move_vfp (rtx *operands)
+@@ -18632,7 +19435,7 @@ output_move_vfp (rtx *operands)
sprintf (buff, templ,
load ? "ld" : "st",
@@ -50390,7 +54035,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
dp ? "P" : "",
integer_p ? "\t%@ int" : "");
output_asm_insn (buff, ops);
-@@ -19058,7 +19161,8 @@ shift_op (rtx op, HOST_WIDE_INT *amountp)
+@@ -19058,7 +19861,8 @@ shift_op (rtx op, HOST_WIDE_INT *amountp)
return NULL;
}
@@ -50400,7 +54045,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return ARM_LSL_NAME;
default:
-@@ -19090,22 +19194,6 @@ shift_op (rtx op, HOST_WIDE_INT *amountp)
+@@ -19090,22 +19894,6 @@ shift_op (rtx op, HOST_WIDE_INT *amountp)
return mnem;
}
@@ -50423,7 +54068,163 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Output a .ascii pseudo-op, keeping track of lengths. This is
because /bin/as is horribly restrictive. The judgement about
whether or not each character is 'printable' (and can be output as
-@@ -22919,6 +23007,8 @@ maybe_get_arm_condition_code (rtx comparison)
+@@ -19462,7 +20250,7 @@ arm_get_vfp_saved_size (void)
+
+ saved = 0;
+ /* Space for saved VFP registers. */
+- if (TARGET_HARD_FLOAT && TARGET_VFP)
++ if (TARGET_HARD_FLOAT)
+ {
+ count = 0;
+ for (regno = FIRST_VFP_REGNUM;
+@@ -19551,6 +20339,7 @@ output_return_instruction (rtx operand, bool really_return, bool reverse,
+ (e.g. interworking) then we can load the return address
+ directly into the PC. Otherwise we must load it into LR. */
+ if (really_return
++ && !IS_CMSE_ENTRY (func_type)
+ && (IS_INTERRUPT (func_type) || !TARGET_INTERWORK))
+ return_reg = reg_names[PC_REGNUM];
+ else
+@@ -19691,8 +20480,45 @@ output_return_instruction (rtx operand, bool really_return, bool reverse,
+ break;
+
+ default:
++ if (IS_CMSE_ENTRY (func_type))
++ {
++	  /* Check if we have to clear the 'GE bits', which are only used if
++	     parallel addition and subtraction instructions are available.  */
++ if (TARGET_INT_SIMD)
++ snprintf (instr, sizeof (instr),
++ "msr%s\tAPSR_nzcvqg, %%|lr", conditional);
++ else
++ snprintf (instr, sizeof (instr),
++ "msr%s\tAPSR_nzcvq, %%|lr", conditional);
++
++ output_asm_insn (instr, & operand);
++ if (TARGET_HARD_FLOAT && !TARGET_THUMB1)
++ {
++ /* Clear the cumulative exception-status bits (0-4,7) and the
++ condition code bits (28-31) of the FPSCR. We need to
++ remember to clear the first scratch register used (IP) and
++ save and restore the second (r4). */
++ snprintf (instr, sizeof (instr), "push\t{%%|r4}");
++ output_asm_insn (instr, & operand);
++ snprintf (instr, sizeof (instr), "vmrs\t%%|ip, fpscr");
++ output_asm_insn (instr, & operand);
++ snprintf (instr, sizeof (instr), "movw\t%%|r4, #65376");
++ output_asm_insn (instr, & operand);
++ snprintf (instr, sizeof (instr), "movt\t%%|r4, #4095");
++ output_asm_insn (instr, & operand);
++ snprintf (instr, sizeof (instr), "and\t%%|ip, %%|r4");
++ output_asm_insn (instr, & operand);
++ snprintf (instr, sizeof (instr), "vmsr\tfpscr, %%|ip");
++ output_asm_insn (instr, & operand);
++ snprintf (instr, sizeof (instr), "pop\t{%%|r4}");
++ output_asm_insn (instr, & operand);
++ snprintf (instr, sizeof (instr), "mov\t%%|ip, %%|lr");
++ output_asm_insn (instr, & operand);
++ }
++ snprintf (instr, sizeof (instr), "bxns\t%%|lr");
++ }
+ /* Use bx if it's available. */
+- if (arm_arch5 || arm_arch4t)
++ else if (arm_arch5 || arm_arch4t)
+ sprintf (instr, "bx%s\t%%|lr", conditional);
+ else
+ sprintf (instr, "mov%s\t%%|pc, %%|lr", conditional);
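
A note on the constants above: movt #4095 / movw #65376 assemble
r4 = (0x0fff << 16) | 0xff60 = 0x0fffff60, so the following 'and' clears
exactly bits 0-4, 7 and 28-31 of the FPSCR copy held in ip — the cumulative
exception flags and the N/Z/C/V condition flags named in the comment — before
it is written back with vmsr.
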
+@@ -19705,6 +20531,44 @@ output_return_instruction (rtx operand, bool really_return, bool reverse,
+ return "";
+ }
+
++/* Output in FILE asm statements needed to declare the NAME of the function
++ defined by its DECL node. */
++
++void
++arm_asm_declare_function_name (FILE *file, const char *name, tree decl)
++{
++ size_t cmse_name_len;
++ char *cmse_name = 0;
++ char cmse_prefix[] = "__acle_se_";
++
++ /* When compiling with ARMv8-M Security Extensions enabled, we should print an
++ extra function label for each function with the 'cmse_nonsecure_entry'
++ attribute. This extra function label should be prepended with
++ '__acle_se_', telling the linker that it needs to create secure gateway
++ veneers for this function. */
++ if (use_cmse && lookup_attribute ("cmse_nonsecure_entry",
++ DECL_ATTRIBUTES (decl)))
++ {
++ cmse_name_len = sizeof (cmse_prefix) + strlen (name);
++ cmse_name = XALLOCAVEC (char, cmse_name_len);
++ snprintf (cmse_name, cmse_name_len, "%s%s", cmse_prefix, name);
++ targetm.asm_out.globalize_label (file, cmse_name);
++
++ ARM_DECLARE_FUNCTION_NAME (file, cmse_name, decl);
++ ASM_OUTPUT_TYPE_DIRECTIVE (file, cmse_name, "function");
++ }
++
++ ARM_DECLARE_FUNCTION_NAME (file, name, decl);
++ ASM_OUTPUT_TYPE_DIRECTIVE (file, name, "function");
++ ASM_DECLARE_RESULT (file, DECL_RESULT (decl));
++ ASM_OUTPUT_LABEL (file, name);
++
++ if (cmse_name)
++ ASM_OUTPUT_LABEL (file, cmse_name);
++
++ ARM_OUTPUT_FN_UNWIND (file, TRUE);
++}
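
For a hypothetical entry point get_key, the emitted assembly is roughly:

    .global __acle_se_get_key
    .type   __acle_se_get_key, %function
    .type   get_key, %function
    get_key:
    __acle_se_get_key:

Both labels name the same address; the __acle_se_ alias is the linker's cue
to build a secure gateway veneer for the function.
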
++
+ /* Write the function name into the code section, directly preceding
+ the function prologue.
+
+@@ -19754,10 +20618,6 @@ arm_output_function_prologue (FILE *f, HOST_WIDE_INT frame_size)
+ {
+ unsigned long func_type;
+
+- /* ??? Do we want to print some of the below anyway? */
+- if (TARGET_THUMB1)
+- return;
+-
+ /* Sanity check. */
+ gcc_assert (!arm_ccfsm_state && !arm_target_insn);
+
+@@ -19792,6 +20652,8 @@ arm_output_function_prologue (FILE *f, HOST_WIDE_INT frame_size)
+ asm_fprintf (f, "\t%@ Nested: function declared inside another function.\n");
+ if (IS_STACKALIGN (func_type))
+ asm_fprintf (f, "\t%@ Stack Align: May be called with mis-aligned SP.\n");
++ if (IS_CMSE_ENTRY (func_type))
++ asm_fprintf (f, "\t%@ Non-secure entry function: called from non-secure code.\n");
+
+ asm_fprintf (f, "\t%@ args = %d, pretend = %d, frame = %wd\n",
+ crtl->args.size,
+@@ -20461,7 +21323,7 @@ arm_emit_vfp_multi_reg_pop (int first_reg, int num_regs, rtx base_reg)
+ REG_NOTES (par) = dwarf;
+
+   /* Make sure cfa doesn't leave with IP_REGNUM to allow unwinding from FP. */
+- if (TARGET_VFP && REGNO (base_reg) == IP_REGNUM)
++ if (REGNO (base_reg) == IP_REGNUM)
+ {
+ RTX_FRAME_RELATED_P (par) = 1;
+ add_reg_note (par, REG_CFA_DEF_CFA, hard_frame_pointer_rtx);
+@@ -20922,7 +21784,7 @@ arm_get_frame_offsets (void)
+ func_type = arm_current_func_type ();
+ /* Space for saved VFP registers. */
+ if (! IS_VOLATILE (func_type)
+- && TARGET_HARD_FLOAT && TARGET_VFP)
++ && TARGET_HARD_FLOAT)
+ saved += arm_get_vfp_saved_size ();
+ }
+ else /* TARGET_THUMB1 */
+@@ -21143,7 +22005,7 @@ arm_save_coproc_regs(void)
+ saved_size += 8;
+ }
+
+- if (TARGET_HARD_FLOAT && TARGET_VFP)
++ if (TARGET_HARD_FLOAT)
+ {
+ start_reg = FIRST_VFP_REGNUM;
+
+@@ -22923,6 +23785,8 @@ maybe_get_arm_condition_code (rtx comparison)
{
case LTU: return ARM_CS;
case GEU: return ARM_CC;
@@ -50432,7 +54233,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
default: return ARM_NV;
}
-@@ -22944,6 +23034,14 @@ maybe_get_arm_condition_code (rtx comparison)
+@@ -22948,6 +23812,14 @@ maybe_get_arm_condition_code (rtx comparison)
default: return ARM_NV;
}
@@ -50447,7 +54248,26 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
case CCmode:
switch (comp_code)
{
-@@ -23397,10 +23495,12 @@ arm_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
+@@ -23378,7 +24250,7 @@ arm_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
+ {
+ if (GET_MODE_CLASS (mode) == MODE_CC)
+ return (regno == CC_REGNUM
+- || (TARGET_HARD_FLOAT && TARGET_VFP
++ || (TARGET_HARD_FLOAT
+ && regno == VFPCC_REGNUM));
+
+ if (regno == CC_REGNUM && GET_MODE_CLASS (mode) != MODE_CC)
+@@ -23392,8 +24264,7 @@ arm_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
+ start of an even numbered register pair. */
+ return (ARM_NUM_REGS (mode) < 2) || (regno < LAST_LO_REGNUM);
+
+- if (TARGET_HARD_FLOAT && TARGET_VFP
+- && IS_VFP_REGNUM (regno))
++ if (TARGET_HARD_FLOAT && IS_VFP_REGNUM (regno))
+ {
+ if (mode == SFmode || mode == SImode)
+ return VFP_REGNO_OK_FOR_SINGLE (regno);
+@@ -23401,10 +24272,12 @@ arm_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
if (mode == DFmode)
return VFP_REGNO_OK_FOR_DOUBLE (regno);
@@ -50463,7 +54283,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
if (TARGET_NEON)
return (VALID_NEON_DREG_MODE (mode) && VFP_REGNO_OK_FOR_DOUBLE (regno))
-@@ -23604,26 +23704,6 @@ arm_debugger_arg_offset (int value, rtx addr)
+@@ -23608,26 +24481,6 @@ arm_debugger_arg_offset (int value, rtx addr)
return value;
}
@@ -50490,7 +54310,291 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Implement TARGET_PROMOTED_TYPE. */
static tree
-@@ -25847,13 +25927,6 @@ thumb_reload_out_hi (rtx *operands)
+@@ -23867,8 +24720,8 @@ thumb_pop (FILE *f, unsigned long mask)
+ if (mask & (1 << PC_REGNUM))
+ {
+ /* Catch popping the PC. */
+- if (TARGET_INTERWORK || TARGET_BACKTRACE
+- || crtl->calls_eh_return)
++ if (TARGET_INTERWORK || TARGET_BACKTRACE || crtl->calls_eh_return
++ || IS_CMSE_ENTRY (arm_current_func_type ()))
+ {
+	  /* The PC is never popped directly; instead
+ it is popped into r3 and then BX is used. */
+@@ -23929,7 +24782,14 @@ thumb_exit (FILE *f, int reg_containing_return_addr)
+ if (crtl->calls_eh_return)
+ asm_fprintf (f, "\tadd\t%r, %r\n", SP_REGNUM, ARM_EH_STACKADJ_REGNUM);
+
+- asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr);
++ if (IS_CMSE_ENTRY (arm_current_func_type ()))
++ {
++ asm_fprintf (f, "\tmsr\tAPSR_nzcvq, %r\n",
++ reg_containing_return_addr);
++ asm_fprintf (f, "\tbxns\t%r\n", reg_containing_return_addr);
++ }
++ else
++ asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr);
+ return;
+ }
+ /* Otherwise if we are not supporting interworking and we have not created
+@@ -23938,7 +24798,8 @@ thumb_exit (FILE *f, int reg_containing_return_addr)
+ else if (!TARGET_INTERWORK
+ && !TARGET_BACKTRACE
+ && !is_called_in_ARM_mode (current_function_decl)
+- && !crtl->calls_eh_return)
++ && !crtl->calls_eh_return
++ && !IS_CMSE_ENTRY (arm_current_func_type ()))
+ {
+ asm_fprintf (f, "\tpop\t{%r}\n", PC_REGNUM);
+ return;
+@@ -24161,7 +25022,21 @@ thumb_exit (FILE *f, int reg_containing_return_addr)
+ asm_fprintf (f, "\tadd\t%r, %r\n", SP_REGNUM, ARM_EH_STACKADJ_REGNUM);
+
+ /* Return to caller. */
+- asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr);
++ if (IS_CMSE_ENTRY (arm_current_func_type ()))
++ {
++ /* This is for the cases where LR is not being used to contain the return
++ address. It may therefore contain information that we might not want
++ to leak, hence it must be cleared. The value in R0 will never be a
++ secret at this point, so it is safe to use it, see the clearing code
++ in 'cmse_nonsecure_entry_clear_before_return'. */
++ if (reg_containing_return_addr != LR_REGNUM)
++ asm_fprintf (f, "\tmov\tlr, r0\n");
++
++ asm_fprintf (f, "\tmsr\tAPSR_nzcvq, %r\n", reg_containing_return_addr);
++ asm_fprintf (f, "\tbxns\t%r\n", reg_containing_return_addr);
++ }
++ else
++ asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr);
+ }
+
+ /* Scan INSN just before assembler is output for it.
+@@ -25026,6 +25901,149 @@ thumb1_expand_prologue (void)
+ cfun->machine->lr_save_eliminated = 0;
+ }
+
++/* Before exiting a cmse_nonsecure_entry function, clear caller-saved registers
++   that are not used to return values, as well as condition flags that could
++   leak information.  */
++
++void
++cmse_nonsecure_entry_clear_before_return (void)
++{
++ uint64_t to_clear_mask[2];
++ uint32_t padding_bits_to_clear = 0;
++ uint32_t * padding_bits_to_clear_ptr = &padding_bits_to_clear;
++ int regno, maxregno = IP_REGNUM;
++ tree result_type;
++ rtx result_rtl;
++
++ to_clear_mask[0] = (1ULL << (NUM_ARG_REGS)) - 1;
++ to_clear_mask[0] |= (1ULL << IP_REGNUM);
++
++ /* If we are not dealing with -mfloat-abi=soft we will need to clear VFP
++ registers. We also check that TARGET_HARD_FLOAT and !TARGET_THUMB1 hold
++ to make sure the instructions used to clear them are present. */
++ if (TARGET_HARD_FLOAT && !TARGET_THUMB1)
++ {
++ uint64_t float_mask = (1ULL << (D7_VFP_REGNUM + 1)) - 1;
++ maxregno = LAST_VFP_REGNUM;
++
++ float_mask &= ~((1ULL << FIRST_VFP_REGNUM) - 1);
++ to_clear_mask[0] |= float_mask;
++
++ float_mask = (1ULL << (maxregno - 63)) - 1;
++ to_clear_mask[1] = float_mask;
++
++ /* Make sure we don't clear the two scratch registers used to clear the
++ relevant FPSCR bits in output_return_instruction. */
++ emit_use (gen_rtx_REG (SImode, IP_REGNUM));
++ to_clear_mask[0] &= ~(1ULL << IP_REGNUM);
++ emit_use (gen_rtx_REG (SImode, 4));
++ to_clear_mask[0] &= ~(1ULL << 4);
++ }
++
++ /* If the user has defined registers to be caller saved, these are no longer
++ restored by the function before returning and must thus be cleared for
++ security purposes. */
++ for (regno = NUM_ARG_REGS; regno < LAST_VFP_REGNUM; regno++)
++ {
++ /* We do not touch registers that can be used to pass arguments as per
++ the AAPCS, since these should never be made callee-saved by user
++ options. */
++ if (IN_RANGE (regno, FIRST_VFP_REGNUM, D7_VFP_REGNUM))
++ continue;
++ if (IN_RANGE (regno, IP_REGNUM, PC_REGNUM))
++ continue;
++ if (call_used_regs[regno])
++ to_clear_mask[regno / 64] |= (1ULL << (regno % 64));
++ }
++
++ /* Make sure we do not clear the registers used to return the result in. */
++ result_type = TREE_TYPE (DECL_RESULT (current_function_decl));
++ if (!VOID_TYPE_P (result_type))
++ {
++ result_rtl = arm_function_value (result_type, current_function_decl, 0);
++
++ /* No need to check that we return in registers, because we don't
++ support returning on stack yet. */
++ to_clear_mask[0]
++ &= ~compute_not_to_clear_mask (result_type, result_rtl, 0,
++ padding_bits_to_clear_ptr);
++ }
++
++ if (padding_bits_to_clear != 0)
++ {
++ rtx reg_rtx;
++ /* Padding bits to clear is not 0 so we know we are dealing with
++ returning a composite type, which only uses r0. Let's make sure that
++ r1-r3 is cleared too, we will use r1 as a scratch register. */
++ gcc_assert ((to_clear_mask[0] & 0xe) == 0xe);
++
++ reg_rtx = gen_rtx_REG (SImode, R1_REGNUM);
++
++ /* Fill the lower half of the negated padding_bits_to_clear. */
++ emit_move_insn (reg_rtx,
++ GEN_INT ((((~padding_bits_to_clear) << 16u) >> 16u)));
++
++ /* Also fill the top half of the negated padding_bits_to_clear. */
++ if (((~padding_bits_to_clear) >> 16) > 0)
++ emit_insn (gen_rtx_SET (gen_rtx_ZERO_EXTRACT (SImode, reg_rtx,
++ GEN_INT (16),
++ GEN_INT (16)),
++ GEN_INT ((~padding_bits_to_clear) >> 16)));
++
++ emit_insn (gen_andsi3 (gen_rtx_REG (SImode, R0_REGNUM),
++ gen_rtx_REG (SImode, R0_REGNUM),
++ reg_rtx));
++ }
++
++ for (regno = R0_REGNUM; regno <= maxregno; regno++)
++ {
++ if (!(to_clear_mask[regno / 64] & (1ULL << (regno % 64))))
++ continue;
++
++ if (IS_VFP_REGNUM (regno))
++ {
++ /* If regno is an even vfp register and its successor is also to
++ be cleared, use vmov. */
++ if (TARGET_VFP_DOUBLE
++ && VFP_REGNO_OK_FOR_DOUBLE (regno)
++ && to_clear_mask[regno / 64] & (1ULL << ((regno % 64) + 1)))
++ {
++ emit_move_insn (gen_rtx_REG (DFmode, regno),
++ CONST1_RTX (DFmode));
++ emit_use (gen_rtx_REG (DFmode, regno));
++ regno++;
++ }
++ else
++ {
++ emit_move_insn (gen_rtx_REG (SFmode, regno),
++ CONST1_RTX (SFmode));
++ emit_use (gen_rtx_REG (SFmode, regno));
++ }
++ }
++ else
++ {
++ if (TARGET_THUMB1)
++ {
++ if (regno == R0_REGNUM)
++ emit_move_insn (gen_rtx_REG (SImode, regno),
++ const0_rtx);
++ else
++ /* R0 has either been cleared before (see the code above) or it
++ holds a return value; either way, it is not secret
++ information. */
++ emit_move_insn (gen_rtx_REG (SImode, regno),
++ gen_rtx_REG (SImode, R0_REGNUM));
++ emit_use (gen_rtx_REG (SImode, regno));
++ }
++ else
++ {
++ emit_move_insn (gen_rtx_REG (SImode, regno),
++ gen_rtx_REG (SImode, LR_REGNUM));
++ emit_use (gen_rtx_REG (SImode, regno));
++ }
++ }
++ }
++}
++
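For reference, the clearing sequence above runs in the epilogue of every function carrying the cmse_nonsecure_entry attribute. A minimal sketch of such a function, assuming a secure image compiled with -mcmse (the function name and its state are illustrative only):

    #include <arm_cmse.h>

    /* Callable from non-secure state.  On return, GCC emits the sequence
       generated above so that caller-saved core and VFP registers carry
       no secure data back; only r0 holds the return value.  */
    int __attribute__ ((cmse_nonsecure_entry))
    secure_get_counter (void)
    {
      static int counter;   /* secret state, stays in secure RAM */
      return counter++;
    }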
+ /* Generate pattern *pop_multiple_with_stack_update_and_return if single
+ POP instruction can be generated. LR should be replaced by PC. All
+ the checks required are already done by USE_RETURN_INSN (). Hence,
+@@ -25047,6 +26065,12 @@ thumb2_expand_return (bool simple_return)
+
+ if (!simple_return && saved_regs_mask)
+ {
++ /* TODO: Verify that this path is never taken for cmse_nonsecure_entry
++ functions, or adapt the code to handle it as the ACLE requires. This
++ path should not be reachable for cmse_nonsecure_entry functions, but we
++ assert it for now to ensure that future code changes do not silently
++ change this behavior. */
++ gcc_assert (!IS_CMSE_ENTRY (arm_current_func_type ()));
+ if (num_regs == 1)
+ {
+ rtx par = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (2));
+@@ -25069,6 +26093,8 @@ thumb2_expand_return (bool simple_return)
+ }
+ else
+ {
++ if (IS_CMSE_ENTRY (arm_current_func_type ()))
++ cmse_nonsecure_entry_clear_before_return ();
+ emit_jump_insn (simple_return_rtx);
+ }
+ }
+@@ -25127,6 +26153,10 @@ thumb1_expand_epilogue (void)
+
+ if (! df_regs_ever_live_p (LR_REGNUM))
+ emit_use (gen_rtx_REG (SImode, LR_REGNUM));
++
++ /* Clear all caller-saved regs that are not used to return. */
++ if (IS_CMSE_ENTRY (arm_current_func_type ()))
++ cmse_nonsecure_entry_clear_before_return ();
+ }
+
+ /* Epilogue code for APCS frame. */
+@@ -25161,7 +26191,7 @@ arm_expand_epilogue_apcs_frame (bool really_return)
+ floats_from_frame += 4;
+ }
+
+- if (TARGET_HARD_FLOAT && TARGET_VFP)
++ if (TARGET_HARD_FLOAT)
+ {
+ int start_reg;
+ rtx ip_rtx = gen_rtx_REG (SImode, IP_REGNUM);
+@@ -25407,7 +26437,7 @@ arm_expand_epilogue (bool really_return)
+ }
+ }
+
+- if (TARGET_HARD_FLOAT && TARGET_VFP)
++ if (TARGET_HARD_FLOAT)
+ {
+ /* Generate VFP register multi-pop. */
+ int end_reg = LAST_VFP_REGNUM + 1;
+@@ -25464,6 +26494,7 @@ arm_expand_epilogue (bool really_return)
+
+ if (ARM_FUNC_TYPE (func_type) != ARM_FT_INTERWORKED
+ && (TARGET_ARM || ARM_FUNC_TYPE (func_type) == ARM_FT_NORMAL)
++ && !IS_CMSE_ENTRY (func_type)
+ && !IS_STACKALIGN (func_type)
+ && really_return
+ && crtl->args.pretend_args_size == 0
+@@ -25560,6 +26591,14 @@ arm_expand_epilogue (bool really_return)
+ stack_pointer_rtx, stack_pointer_rtx);
+ }
+
++ /* Clear all caller-saved regs that are not used to return. */
++ if (IS_CMSE_ENTRY (arm_current_func_type ()))
++ {
++ /* CMSE_ENTRY always returns. */
++ gcc_assert (really_return);
++ cmse_nonsecure_entry_clear_before_return ();
++ }
++
+ if (!really_return)
+ return;
+
+@@ -25856,13 +26895,6 @@ thumb_reload_out_hi (rtx *operands)
emit_insn (gen_thumb_movhi_clobber (operands[0], operands[1], operands[2]));
}
@@ -50504,7 +54608,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Return the length of a function name prefix
that starts with the character 'c'. */
static int
-@@ -25991,7 +26064,7 @@ arm_file_start (void)
+@@ -26000,7 +27032,7 @@ arm_file_start (void)
const char* pos = strchr (arm_selected_arch->name, '+');
if (pos)
{
@@ -50513,7 +54617,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
gcc_assert (strlen (arm_selected_arch->name)
<= sizeof (buf) / sizeof (*pos));
strncpy (buf, arm_selected_arch->name,
-@@ -26133,11 +26206,10 @@ arm_internal_label (FILE *stream, const char *prefix, unsigned long labelno)
+@@ -26025,7 +27057,7 @@ arm_file_start (void)
+ if (print_tune_info)
+ arm_print_tune_info ();
+
+- if (! TARGET_SOFT_FLOAT && TARGET_VFP)
++ if (! TARGET_SOFT_FLOAT)
+ {
+ if (TARGET_HARD_FLOAT && TARGET_VFP_SINGLE)
+ arm_emit_eabi_attribute ("Tag_ABI_HardFP_use", 27, 1);
+@@ -26142,11 +27174,10 @@ arm_internal_label (FILE *stream, const char *prefix, unsigned long labelno)
/* Output code to add DELTA to the first argument, and then jump
to FUNCTION. Used for C++ multiple inheritance. */
@@ -50528,7 +54641,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
{
static int thunk_label = 0;
char label[256];
-@@ -26278,6 +26350,76 @@ arm_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
+@@ -26287,6 +27318,76 @@ arm_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
final_end_function ();
}
@@ -50605,7 +54718,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
int
arm_emit_vector_const (FILE *file, rtx x)
{
-@@ -27733,7 +27875,7 @@ arm_preferred_rename_class (reg_class_t rclass)
+@@ -27671,7 +28772,7 @@ arm_conditional_register_usage (void)
+ if (TARGET_THUMB1)
+ fixed_regs[LR_REGNUM] = call_used_regs[LR_REGNUM] = 1;
+
+- if (TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP)
++ if (TARGET_32BIT && TARGET_HARD_FLOAT)
+ {
+ /* VFPv3 registers are disabled when earlier VFP
+ versions are selected due to the definition of
+@@ -27742,7 +28843,7 @@ arm_preferred_rename_class (reg_class_t rclass)
return NO_REGS;
}
@@ -50614,7 +54736,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
So this function MUST be kept in sync with that insn pattern. */
int
arm_attr_length_push_multi(rtx parallel_op, rtx first_op)
-@@ -27750,6 +27892,11 @@ arm_attr_length_push_multi(rtx parallel_op, rtx first_op)
+@@ -27759,6 +28860,11 @@ arm_attr_length_push_multi(rtx parallel_op, rtx first_op)
/* Thumb2 mode. */
regno = REGNO (first_op);
@@ -50626,7 +54748,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
hi_reg = (REGNO_REG_CLASS (regno) == HI_REGS) && (regno != LR_REGNUM);
for (i = 1; i < num_saves && !hi_reg; i++)
{
-@@ -27762,6 +27909,56 @@ arm_attr_length_push_multi(rtx parallel_op, rtx first_op)
+@@ -27771,6 +28877,56 @@ arm_attr_length_push_multi(rtx parallel_op, rtx first_op)
return 4;
}
@@ -50683,7 +54805,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Compute the number of instructions emitted by output_move_double. */
int
arm_count_output_move_double_insns (rtx *operands)
-@@ -27793,7 +27990,11 @@ vfp3_const_double_for_fract_bits (rtx operand)
+@@ -27802,7 +28958,11 @@ vfp3_const_double_for_fract_bits (rtx operand)
HOST_WIDE_INT value = real_to_integer (&r0);
value = value & 0xffffffff;
if ((value != 0) && ( (value & (value - 1)) == 0))
@@ -50696,7 +54818,200 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
}
return 0;
-@@ -28350,6 +28551,8 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
+@@ -27942,9 +29102,9 @@ emit_unlikely_jump (rtx insn)
+ void
+ arm_expand_compare_and_swap (rtx operands[])
+ {
+- rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
++ rtx bval, bdst, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
+ machine_mode mode;
+- rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
++ rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx);
+
+ bval = operands[0];
+ rval = operands[1];
+@@ -28001,43 +29161,54 @@ arm_expand_compare_and_swap (rtx operands[])
+ gcc_unreachable ();
+ }
+
+- emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
++ bdst = TARGET_THUMB1 ? bval : gen_rtx_REG (CCmode, CC_REGNUM);
++ emit_insn (gen (bdst, rval, mem, oldval, newval, is_weak, mod_s, mod_f));
+
+ if (mode == QImode || mode == HImode)
+ emit_move_insn (operands[1], gen_lowpart (mode, rval));
+
+ /* In all cases, we arrange for success to be signaled by Z set.
+ This arrangement allows for the boolean result to be used directly
+- in a subsequent branch, post optimization. */
+- x = gen_rtx_REG (CCmode, CC_REGNUM);
+- x = gen_rtx_EQ (SImode, x, const0_rtx);
+- emit_insn (gen_rtx_SET (bval, x));
++ in a subsequent branch, post optimization. For Thumb-1 targets, the
++ boolean negation of the result is also stored in bval because the
++ Thumb-1 backend lacks dependency tracking for the CC flag, as
++ flag-setting is not represented at the RTL level. */
++ if (TARGET_THUMB1)
++ emit_insn (gen_cstoresi_eq0_thumb1 (bval, bdst));
++ else
++ {
++ x = gen_rtx_EQ (SImode, bdst, const0_rtx);
++ emit_insn (gen_rtx_SET (bval, x));
++ }
+ }
+
+ /* Split a compare and swap pattern. It is IMPLEMENTATION DEFINED whether
+ another memory store between the load-exclusive and store-exclusive can
+ reset the monitor from Exclusive to Open state. This means we must wait
+ until after reload to split the pattern, lest we get a register spill in
+- the middle of the atomic sequence. */
++ the middle of the atomic sequence. Success of the compare and swap is
++ indicated by the Z flag being set for 32-bit targets, and by neg_bval
++ being zero for Thumb-1 targets (i.e. the negation of the boolean value
++ returned by the atomic_compare_and_swapmode standard pattern in operand 0). */
+
+ void
+ arm_split_compare_and_swap (rtx operands[])
+ {
+- rtx rval, mem, oldval, newval, scratch;
++ rtx rval, mem, oldval, newval, neg_bval;
+ machine_mode mode;
+ enum memmodel mod_s, mod_f;
+ bool is_weak;
+ rtx_code_label *label1, *label2;
+ rtx x, cond;
+
+- rval = operands[0];
+- mem = operands[1];
+- oldval = operands[2];
+- newval = operands[3];
+- is_weak = (operands[4] != const0_rtx);
+- mod_s = memmodel_from_int (INTVAL (operands[5]));
+- mod_f = memmodel_from_int (INTVAL (operands[6]));
+- scratch = operands[7];
++ rval = operands[1];
++ mem = operands[2];
++ oldval = operands[3];
++ newval = operands[4];
++ is_weak = (operands[5] != const0_rtx);
++ mod_s = memmodel_from_int (INTVAL (operands[6]));
++ mod_f = memmodel_from_int (INTVAL (operands[7]));
++ neg_bval = TARGET_THUMB1 ? operands[0] : operands[8];
+ mode = GET_MODE (mem);
+
+ bool is_armv8_sync = arm_arch8 && is_mm_sync (mod_s);
+@@ -28069,26 +29240,44 @@ arm_split_compare_and_swap (rtx operands[])
+
+ arm_emit_load_exclusive (mode, rval, mem, use_acquire);
+
+- cond = arm_gen_compare_reg (NE, rval, oldval, scratch);
+- x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+- x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+- gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
+- emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
++ /* Z is set to 0 for 32-bit targets (resp. neg_bval set to 1 for Thumb-1)
++ if oldval != rval, as required to communicate with
++ arm_expand_compare_and_swap. */
++ if (TARGET_32BIT)
++ {
++ cond = arm_gen_compare_reg (NE, rval, oldval, neg_bval);
++ x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
++ x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
++ gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
++ emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
++ }
++ else
++ {
++ emit_move_insn (neg_bval, const1_rtx);
++ cond = gen_rtx_NE (VOIDmode, rval, oldval);
++ if (thumb1_cmpneg_operand (oldval, SImode))
++ emit_unlikely_jump (gen_cbranchsi4_scratch (neg_bval, rval, oldval,
++ label2, cond));
++ else
++ emit_unlikely_jump (gen_cbranchsi4_insn (cond, rval, oldval, label2));
++ }
+
+- arm_emit_store_exclusive (mode, scratch, mem, newval, use_release);
++ arm_emit_store_exclusive (mode, neg_bval, mem, newval, use_release);
+
+ /* Weak or strong, we want EQ to be true for success, so that we
+ match the flags that we got from the compare above. */
+- cond = gen_rtx_REG (CCmode, CC_REGNUM);
+- x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
+- emit_insn (gen_rtx_SET (cond, x));
++ if (TARGET_32BIT)
++ {
++ cond = gen_rtx_REG (CCmode, CC_REGNUM);
++ x = gen_rtx_COMPARE (CCmode, neg_bval, const0_rtx);
++ emit_insn (gen_rtx_SET (cond, x));
++ }
+
+ if (!is_weak)
+ {
+- x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+- x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+- gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
+- emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
++ /* Z is set to the boolean value of !neg_bval, as required to communicate
++ with arm_expand_compare_and_swap. */
++ x = gen_rtx_NE (VOIDmode, neg_bval, const0_rtx);
++ emit_unlikely_jump (gen_cbranchsi4 (x, neg_bval, const0_rtx, label1));
+ }
+
+ if (!is_mm_relaxed (mod_f))
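The split above serves GCC's generic compare-and-swap expansion; a minimal C illustration of what ultimately reaches arm_expand_compare_and_swap and arm_split_compare_and_swap (the names used are illustrative):

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Expands to a load-exclusive/store-exclusive loop on targets with
       LDREX/STREX.  On 32-bit targets success is carried in the Z flag;
       on Thumb-1 it is materialised in a register (neg_bval) instead.  */
    bool
    try_claim (atomic_int *lock)
    {
      int expected = 0;
      return atomic_compare_exchange_strong (lock, &expected, 1);
    }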
+@@ -28103,6 +29292,15 @@ arm_split_compare_and_swap (rtx operands[])
+ emit_label (label2);
+ }
+
++/* Split an atomic operation pattern. The operation is given by CODE and is
++ one of PLUS, MINUS, IOR, XOR, SET (for an exchange operation) or NOT (for
++ a NAND operation). The operation is performed on the content at MEM and
++ on VALUE, following the memory model MODEL_RTX. The content at MEM before
++ and after the operation is returned in OLD_OUT and NEW_OUT respectively,
++ while the success of the operation is returned in COND. Using a scratch
++ register or an operand register for these determines what result is
++ returned for that pattern. */
++
+ void
+ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
+ rtx value, rtx model_rtx, rtx cond)
+@@ -28111,6 +29309,7 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
+ machine_mode mode = GET_MODE (mem);
+ machine_mode wmode = (mode == DImode ? DImode : SImode);
+ rtx_code_label *label;
++ bool all_low_regs, bind_old_new;
+ rtx x;
+
+ bool is_armv8_sync = arm_arch8 && is_mm_sync (model);
+@@ -28145,6 +29344,28 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
+
+ arm_emit_load_exclusive (mode, old_out, mem, use_acquire);
+
++ /* Does the operation require the destination and the first operand to use
++ the same register? This is decided by the register constraints of the
++ relevant insn patterns in thumb1.md. */
++ gcc_assert (!new_out || REG_P (new_out));
++ all_low_regs = REG_P (value) && REGNO_REG_CLASS (REGNO (value)) == LO_REGS
++ && new_out && REGNO_REG_CLASS (REGNO (new_out)) == LO_REGS
++ && REGNO_REG_CLASS (REGNO (old_out)) == LO_REGS;
++ bind_old_new =
++ (TARGET_THUMB1
++ && code != SET
++ && code != MINUS
++ && (code != PLUS || (!all_low_regs && !satisfies_constraint_L (value))));
++
++ /* We want to return the old value while putting the result of the operation
++ in the same register as the old value, so copy the old value over to the
++ destination register and use that register for the operation. */
++ if (old_out && bind_old_new)
++ {
++ emit_move_insn (new_out, old_out);
++ old_out = new_out;
++ }
++
+ switch (code)
+ {
+ case SET:
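The same splitter backs the read-modify-write atomic built-ins; a short sketch of operations that reach arm_split_atomic_op (illustrative only):

    #include <stdatomic.h>

    /* PLUS and IOR cases of arm_split_atomic_op.  On Thumb-1, when the
       insn constraints require it, the old value is first copied into
       the destination register (the bind_old_new path above).  */
    int  bump (atomic_int *p) { return atomic_fetch_add (p, 4); }
    void mark (atomic_int *p) { atomic_fetch_or (p, 0x1); }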
+@@ -28359,6 +29580,8 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
case V8QImode: gen = gen_neon_vuzpv8qi_internal; break;
case V8HImode: gen = gen_neon_vuzpv8hi_internal; break;
case V4HImode: gen = gen_neon_vuzpv4hi_internal; break;
@@ -50705,7 +55020,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
case V4SImode: gen = gen_neon_vuzpv4si_internal; break;
case V2SImode: gen = gen_neon_vuzpv2si_internal; break;
case V2SFmode: gen = gen_neon_vuzpv2sf_internal; break;
-@@ -28423,6 +28626,8 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
+@@ -28432,6 +29655,8 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
case V8QImode: gen = gen_neon_vzipv8qi_internal; break;
case V8HImode: gen = gen_neon_vzipv8hi_internal; break;
case V4HImode: gen = gen_neon_vzipv4hi_internal; break;
@@ -50714,7 +55029,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
case V4SImode: gen = gen_neon_vzipv4si_internal; break;
case V2SImode: gen = gen_neon_vzipv2si_internal; break;
case V2SFmode: gen = gen_neon_vzipv2sf_internal; break;
-@@ -28475,6 +28680,8 @@ arm_evpc_neon_vrev (struct expand_vec_perm_d *d)
+@@ -28484,6 +29709,8 @@ arm_evpc_neon_vrev (struct expand_vec_perm_d *d)
case V8QImode: gen = gen_neon_vrev32v8qi; break;
case V8HImode: gen = gen_neon_vrev64v8hi; break;
case V4HImode: gen = gen_neon_vrev64v4hi; break;
@@ -50723,7 +55038,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
default:
return false;
}
-@@ -28558,6 +28765,8 @@ arm_evpc_neon_vtrn (struct expand_vec_perm_d *d)
+@@ -28567,6 +29794,8 @@ arm_evpc_neon_vtrn (struct expand_vec_perm_d *d)
case V8QImode: gen = gen_neon_vtrnv8qi_internal; break;
case V8HImode: gen = gen_neon_vtrnv8hi_internal; break;
case V4HImode: gen = gen_neon_vtrnv4hi_internal; break;
@@ -50732,7 +55047,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
case V4SImode: gen = gen_neon_vtrnv4si_internal; break;
case V2SImode: gen = gen_neon_vtrnv2si_internal; break;
case V2SFmode: gen = gen_neon_vtrnv2sf_internal; break;
-@@ -28633,6 +28842,8 @@ arm_evpc_neon_vext (struct expand_vec_perm_d *d)
+@@ -28642,6 +29871,8 @@ arm_evpc_neon_vext (struct expand_vec_perm_d *d)
case V8HImode: gen = gen_neon_vextv8hi; break;
case V2SImode: gen = gen_neon_vextv2si; break;
case V4SImode: gen = gen_neon_vextv4si; break;
@@ -50741,7 +55056,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
case V2SFmode: gen = gen_neon_vextv2sf; break;
case V4SFmode: gen = gen_neon_vextv4sf; break;
case V2DImode: gen = gen_neon_vextv2di; break;
-@@ -29158,7 +29369,7 @@ arm_validize_comparison (rtx *comparison, rtx * op1, rtx * op2)
+@@ -29167,7 +30398,7 @@ arm_validize_comparison (rtx *comparison, rtx * op1, rtx * op2)
{
enum rtx_code code = GET_CODE (*comparison);
int code_int;
@@ -50750,7 +55065,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
? GET_MODE (*op2) : GET_MODE (*op1);
gcc_assert (GET_MODE (*op1) != VOIDmode || GET_MODE (*op2) != VOIDmode);
-@@ -29186,6 +29397,14 @@ arm_validize_comparison (rtx *comparison, rtx * op1, rtx * op2)
+@@ -29195,11 +30426,19 @@ arm_validize_comparison (rtx *comparison, rtx * op1, rtx * op2)
*op2 = force_reg (mode, *op2);
return true;
@@ -50764,8 +55079,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ /* Fall through. */
case SFmode:
case DFmode:
- if (!arm_float_compare_operand (*op1, mode))
-@@ -29732,11 +29951,57 @@ arm_macro_fusion_p (void)
+- if (!arm_float_compare_operand (*op1, mode))
++ if (!vfp_compare_operand (*op1, mode))
+ *op1 = force_reg (mode, *op1);
+- if (!arm_float_compare_operand (*op2, mode))
++ if (!vfp_compare_operand (*op2, mode))
+ *op2 = force_reg (mode, *op2);
+ return true;
+ default:
+@@ -29741,11 +30980,57 @@ arm_macro_fusion_p (void)
return current_tune->fusible_ops != tune_params::FUSE_NOTHING;
}
@@ -50824,7 +55146,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
rtx prev_set = single_set (prev);
rtx curr_set = single_set (curr);
-@@ -29754,54 +30019,26 @@ aarch_macro_fusion_pair_p (rtx_insn* prev, rtx_insn* curr)
+@@ -29763,54 +31048,26 @@ aarch_macro_fusion_pair_p (rtx_insn* prev, rtx_insn* curr)
&& aarch_crypto_can_dual_issue (prev, curr))
return true;
@@ -50890,7 +55212,31 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
-@@ -30306,4 +30543,113 @@ arm_sched_fusion_priority (rtx_insn *insn, int max_pri,
+@@ -29835,9 +31092,9 @@ arm_const_not_ok_for_debug_p (rtx p)
+ && GET_CODE (XEXP (p, 0)) == SYMBOL_REF
+ && (decl_op0 = SYMBOL_REF_DECL (XEXP (p, 0))))
+ {
+- if ((TREE_CODE (decl_op1) == VAR_DECL
++ if ((VAR_P (decl_op1)
+ || TREE_CODE (decl_op1) == CONST_DECL)
+- && (TREE_CODE (decl_op0) == VAR_DECL
++ && (VAR_P (decl_op0)
+ || TREE_CODE (decl_op0) == CONST_DECL))
+ return (get_variable_section (decl_op1, false)
+ != get_variable_section (decl_op0, false));
+@@ -29970,9 +31227,8 @@ arm_can_inline_p (tree caller, tree callee)
+ if ((caller_fpu->features & callee_fpu->features) != callee_fpu->features)
+ return false;
+
+- /* Need same model and regs. */
+- if (callee_fpu->model != caller_fpu->model
+- || callee_fpu->regs != callee_fpu->regs)
++ /* Need same FPU regs. */
++ if (callee_fpu->regs != caller_fpu->regs)
+ return false;
+
+ /* OK to inline between different modes.
+@@ -30315,4 +31571,113 @@ arm_sched_fusion_priority (rtx_insn *insn, int max_pri,
return;
}
@@ -51018,17 +55364,72 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Callback to output language specific object attributes. */
extern void (*arm_lang_output_object_attributes_hook)(void);
-@@ -194,7 +189,8 @@ extern void (*arm_lang_output_object_attributes_hook)(void);
+@@ -139,7 +134,6 @@ extern void (*arm_lang_output_object_attributes_hook)(void);
+ #define TARGET_HARD_FLOAT (arm_float_abi != ARM_FLOAT_ABI_SOFT)
+ /* Use hardware floating point calling convention. */
+ #define TARGET_HARD_FLOAT_ABI (arm_float_abi == ARM_FLOAT_ABI_HARD)
+-#define TARGET_VFP (TARGET_FPU_MODEL == ARM_FP_MODEL_VFP)
+ #define TARGET_IWMMXT (arm_arch_iwmmxt)
+ #define TARGET_IWMMXT2 (arm_arch_iwmmxt2)
+ #define TARGET_REALLY_IWMMXT (TARGET_IWMMXT && TARGET_32BIT)
+@@ -177,50 +171,57 @@ extern void (*arm_lang_output_object_attributes_hook)(void);
+ to be more careful with TARGET_NEON as noted below. */
+
+ /* FPU has the full VFPv3/NEON register file of 32 D registers. */
+-#define TARGET_VFPD32 (TARGET_VFP && TARGET_FPU_REGS == VFP_REG_D32)
++#define TARGET_VFPD32 (TARGET_FPU_REGS == VFP_REG_D32)
+
+ /* FPU supports VFPv3 instructions. */
+-#define TARGET_VFP3 (TARGET_VFP && TARGET_FPU_REV >= 3)
++#define TARGET_VFP3 (TARGET_FPU_REV >= 3)
+
+ /* FPU supports FPv5 instructions. */
+-#define TARGET_VFP5 (TARGET_VFP && TARGET_FPU_REV >= 5)
++#define TARGET_VFP5 (TARGET_FPU_REV >= 5)
+
+ /* FPU only supports VFP single-precision instructions. */
+-#define TARGET_VFP_SINGLE (TARGET_VFP && TARGET_FPU_REGS == VFP_REG_SINGLE)
++#define TARGET_VFP_SINGLE (TARGET_FPU_REGS == VFP_REG_SINGLE)
+
+ /* FPU supports VFP double-precision instructions. */
+-#define TARGET_VFP_DOUBLE (TARGET_VFP && TARGET_FPU_REGS != VFP_REG_SINGLE)
++#define TARGET_VFP_DOUBLE (TARGET_FPU_REGS != VFP_REG_SINGLE)
+
/* FPU supports half-precision floating-point with NEON element load/store. */
- #define TARGET_NEON_FP16 \
- (TARGET_VFP \
+-#define TARGET_NEON_FP16 \
+- (TARGET_VFP \
- && ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_NEON | FPU_FL_FP16))
-+ && ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_NEON) \
++#define TARGET_NEON_FP16 \
++ (ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_NEON) \
+ && ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_FP16))
/* FPU supports VFP half-precision floating-point. */
#define TARGET_FP16 \
-@@ -221,6 +217,13 @@ extern void (*arm_lang_output_object_attributes_hook)(void);
+- (TARGET_VFP && ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_FP16))
++ (ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_FP16))
+
+ /* FPU supports fused-multiply-add operations. */
+-#define TARGET_FMA (TARGET_VFP && TARGET_FPU_REV >= 4)
++#define TARGET_FMA (TARGET_FPU_REV >= 4)
+
+ /* FPU is ARMv8 compatible. */
+-#define TARGET_FPU_ARMV8 (TARGET_VFP && TARGET_FPU_REV >= 8)
++#define TARGET_FPU_ARMV8 (TARGET_FPU_REV >= 8)
+
+ /* FPU supports Crypto extensions. */
+ #define TARGET_CRYPTO \
+- (TARGET_VFP && ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_CRYPTO))
++ (ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_CRYPTO))
+
+ /* FPU supports Neon instructions. The setting of this macro gets
+ revealed via __ARM_NEON__ so we add extra guards upon TARGET_32BIT
+ and TARGET_HARD_FLOAT to ensure that NEON instructions are
+ available. */
+ #define TARGET_NEON \
+- (TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP \
++ (TARGET_32BIT && TARGET_HARD_FLOAT \
+ && ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_NEON))
+
/* FPU supports ARMv8.1 Adv.SIMD extensions. */
#define TARGET_NEON_RDMA (TARGET_NEON && arm_arch8_1)
@@ -51042,7 +55443,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Q-bit is present. */
#define TARGET_ARM_QBIT \
(TARGET_32BIT && arm_arch5e && (arm_arch_notm || arm_arch7))
-@@ -236,7 +239,7 @@ extern void (*arm_lang_output_object_attributes_hook)(void);
+@@ -236,7 +237,7 @@ extern void (*arm_lang_output_object_attributes_hook)(void);
/* Should MOVW/MOVT be used in preference to a constant pool. */
#define TARGET_USE_MOVT \
@@ -51051,13 +55452,31 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
&& (arm_disable_literal_pool \
|| (!optimize_size && !current_tune->prefer_constant_pool)))
-@@ -265,11 +268,22 @@ extern void (*arm_lang_output_object_attributes_hook)(void);
- || arm_arch7) && arm_arch_notm)
+@@ -251,14 +252,18 @@ extern void (*arm_lang_output_object_attributes_hook)(void);
+ #define TARGET_HAVE_MEMORY_BARRIER (TARGET_HAVE_DMB || TARGET_HAVE_DMB_MCR)
+
+ /* Nonzero if this chip supports ldrex and strex */
+-#define TARGET_HAVE_LDREX ((arm_arch6 && TARGET_ARM) || arm_arch7)
++#define TARGET_HAVE_LDREX ((arm_arch6 && TARGET_ARM) \
++ || arm_arch7 \
++ || (arm_arch8 && !arm_arch_notm))
+
+ /* Nonzero if this chip supports LPAE. */
+ #define TARGET_HAVE_LPAE \
+ (arm_arch7 && ARM_FSET_HAS_CPU1 (insn_flags, FL_FOR_ARCH7VE))
+ /* Nonzero if this chip supports ldrex{bh} and strex{bh}. */
+-#define TARGET_HAVE_LDREXBH ((arm_arch6k && TARGET_ARM) || arm_arch7)
++#define TARGET_HAVE_LDREXBH ((arm_arch6k && TARGET_ARM) \
++ || arm_arch7 \
++ || (arm_arch8 && !arm_arch_notm))
+
+ /* Nonzero if this chip supports ldrexd and strexd. */
+ #define TARGET_HAVE_LDREXD (((arm_arch6k && TARGET_ARM) \
+@@ -267,9 +272,20 @@ extern void (*arm_lang_output_object_attributes_hook)(void);
/* Nonzero if this chip supports load-acquire and store-release. */
--#define TARGET_HAVE_LDACQ (TARGET_ARM_ARCH >= 8)
-+#define TARGET_HAVE_LDACQ (TARGET_ARM_ARCH >= 8 && TARGET_32BIT)
-+
+ #define TARGET_HAVE_LDACQ (TARGET_ARM_ARCH >= 8)
+
+/* Nonzero if this chip supports LDAEXD and STLEXD. */
+#define TARGET_HAVE_LDACQEXD (TARGET_ARM_ARCH >= 8 \
+ && TARGET_32BIT \
@@ -51068,7 +55487,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+
+/* Nonzero if this chip provides the CBZ and CBNZ instructions. */
+#define TARGET_HAVE_CBZ (arm_arch_thumb2 || arm_arch8)
-
++
/* Nonzero if integer division instructions supported. */
#define TARGET_IDIV ((TARGET_ARM && arm_arch_arm_hwdiv) \
- || (TARGET_THUMB2 && arm_arch_thumb_hwdiv))
@@ -51076,6 +55495,22 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Nonzero if disallow volatile memory access in IT block. */
#define TARGET_NO_VOLATILE_CE (arm_arch_no_volatile_ce)
+@@ -349,7 +365,6 @@ enum vfp_reg_type
+ extern const struct arm_fpu_desc
+ {
+ const char *name;
+- enum arm_fp_model model;
+ int rev;
+ enum vfp_reg_type regs;
+ arm_fpu_feature_set features;
+@@ -358,7 +373,6 @@ extern const struct arm_fpu_desc
+ /* Accessors. */
+
+ #define TARGET_FPU_NAME (all_fpus[arm_fpu_index].name)
+-#define TARGET_FPU_MODEL (all_fpus[arm_fpu_index].model)
+ #define TARGET_FPU_REV (all_fpus[arm_fpu_index].rev)
+ #define TARGET_FPU_REGS (all_fpus[arm_fpu_index].regs)
+ #define TARGET_FPU_FEATURES (all_fpus[arm_fpu_index].features)
@@ -402,7 +416,9 @@ enum base_architecture
BASE_ARCH_7R = 7,
BASE_ARCH_7M = 7,
@@ -51111,7 +55546,71 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Nonzero if chip supports Thumb 2. */
extern int arm_arch_thumb2;
-@@ -2187,13 +2213,9 @@ extern int making_const_table;
+@@ -502,6 +528,9 @@ extern bool arm_disable_literal_pool;
+ /* Nonzero if chip supports the ARMv8 CRC instructions. */
+ extern int arm_arch_crc;
+
++/* Nonzero if chip supports the ARMv8-M Security Extensions. */
++extern int arm_arch_cmse;
++
+ #ifndef TARGET_DEFAULT
+ #define TARGET_DEFAULT (MASK_APCS_FRAME)
+ #endif
+@@ -1191,7 +1220,7 @@ enum reg_class
+ the data layout happens to be consistent for big-endian, so we explicitly allow
+ that case. */
+ #define CANNOT_CHANGE_MODE_CLASS(FROM, TO, CLASS) \
+- (TARGET_VFP && TARGET_BIG_END \
++ (TARGET_BIG_END \
+ && !(GET_MODE_SIZE (FROM) == 16 && GET_MODE_SIZE (TO) == 8) \
+ && (GET_MODE_SIZE (FROM) > UNITS_PER_WORD \
+ || GET_MODE_SIZE (TO) > UNITS_PER_WORD) \
+@@ -1242,8 +1271,7 @@ enum reg_class
+ NO_REGS is returned. */
+ #define SECONDARY_OUTPUT_RELOAD_CLASS(CLASS, MODE, X) \
+ /* Restrict which direct reloads are allowed for VFP/iWMMXt regs. */ \
+- ((TARGET_VFP && TARGET_HARD_FLOAT \
+- && IS_VFP_CLASS (CLASS)) \
++ ((TARGET_HARD_FLOAT && IS_VFP_CLASS (CLASS)) \
+ ? coproc_secondary_reload_class (MODE, X, FALSE) \
+ : (TARGET_IWMMXT && (CLASS) == IWMMXT_REGS) \
+ ? coproc_secondary_reload_class (MODE, X, TRUE) \
+@@ -1255,8 +1283,7 @@ enum reg_class
+ /* If we need to load shorts byte-at-a-time, then we need a scratch. */
+ #define SECONDARY_INPUT_RELOAD_CLASS(CLASS, MODE, X) \
+ /* Restrict which direct reloads are allowed for VFP/iWMMXt regs. */ \
+- ((TARGET_VFP && TARGET_HARD_FLOAT \
+- && IS_VFP_CLASS (CLASS)) \
++ ((TARGET_HARD_FLOAT && IS_VFP_CLASS (CLASS)) \
+ ? coproc_secondary_reload_class (MODE, X, FALSE) : \
+ (TARGET_IWMMXT && (CLASS) == IWMMXT_REGS) ? \
+ coproc_secondary_reload_class (MODE, X, TRUE) : \
+@@ -1363,6 +1390,7 @@ enum reg_class
+ #define ARM_FT_VOLATILE (1 << 4) /* Does not return. */
+ #define ARM_FT_NESTED (1 << 5) /* Embedded inside another func. */
+ #define ARM_FT_STACKALIGN (1 << 6) /* Called with misaligned stack. */
++#define ARM_FT_CMSE_ENTRY (1 << 7) /* ARMv8-M non-secure entry function. */
+
+ /* Some macros to test these flags. */
+ #define ARM_FUNC_TYPE(t) (t & ARM_FT_TYPE_MASK)
+@@ -1371,6 +1399,7 @@ enum reg_class
+ #define IS_NAKED(t) (t & ARM_FT_NAKED)
+ #define IS_NESTED(t) (t & ARM_FT_NESTED)
+ #define IS_STACKALIGN(t) (t & ARM_FT_STACKALIGN)
++#define IS_CMSE_ENTRY(t) (t & ARM_FT_CMSE_ENTRY)
+
+
+ /* Structure used to hold the function stack frame layout. Offsets are
+@@ -1516,7 +1545,7 @@ typedef struct
+ On the ARM, r0-r3 are used to pass args. */
+ #define FUNCTION_ARG_REGNO_P(REGNO) \
+ (IN_RANGE ((REGNO), 0, 3) \
+- || (TARGET_AAPCS_BASED && TARGET_VFP && TARGET_HARD_FLOAT \
++ || (TARGET_AAPCS_BASED && TARGET_HARD_FLOAT \
+ && IN_RANGE ((REGNO), FIRST_VFP_REGNUM, FIRST_VFP_REGNUM + 15)) \
+ || (TARGET_IWMMXT_ABI \
+ && IN_RANGE ((REGNO), FIRST_IWMMXT_REGNUM, FIRST_IWMMXT_REGNUM + 9)))
+@@ -2187,13 +2216,9 @@ extern int making_const_table;
#define TARGET_ARM_ARCH \
(arm_base_arch) \
@@ -51127,6 +55626,50 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Expands to an upper-case char of the target's architectural
profile. */
+@@ -2245,13 +2270,18 @@ extern const char *arm_rewrite_mcpu (int argc, const char **argv);
+ " :%{march=*:-march=%*}}" \
+ BIG_LITTLE_SPEC
+
++extern const char *arm_target_thumb_only (int argc, const char **argv);
++#define TARGET_MODE_SPEC_FUNCTIONS \
++ { "target_mode_check", arm_target_thumb_only },
++
+ /* -mcpu=native handling only makes sense with compiler running on
+ an ARM chip. */
+ #if defined(__arm__)
+ extern const char *host_detect_local_cpu (int argc, const char **argv);
+ # define EXTRA_SPEC_FUNCTIONS \
+ { "local_cpu_detect", host_detect_local_cpu }, \
+- BIG_LITTLE_CPU_SPEC_FUNCTIONS
++ BIG_LITTLE_CPU_SPEC_FUNCTIONS \
++ TARGET_MODE_SPEC_FUNCTIONS
+
+ # define MCPU_MTUNE_NATIVE_SPECS \
+ " %{march=native:%<march=native %:local_cpu_detect(arch)}" \
+@@ -2259,10 +2289,21 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
+ " %{mtune=native:%<mtune=native %:local_cpu_detect(tune)}"
+ #else
+ # define MCPU_MTUNE_NATIVE_SPECS ""
+-# define EXTRA_SPEC_FUNCTIONS BIG_LITTLE_CPU_SPEC_FUNCTIONS
++# define EXTRA_SPEC_FUNCTIONS \
++ BIG_LITTLE_CPU_SPEC_FUNCTIONS \
++ TARGET_MODE_SPEC_FUNCTIONS
+ #endif
+
+-#define DRIVER_SELF_SPECS MCPU_MTUNE_NATIVE_SPECS
++/* Automatically add -mthumb for Thumb-only targets if mode isn't specified
++ via the configuration option --with-mode or via the command line. The
++ function target_mode_check is called to do the check with either:
++ - an array of -march values if any is given;
++ - an array of -mcpu values if any is given;
++ - an empty array. */
++#define TARGET_MODE_SPECS \
++ " %{!marm:%{!mthumb:%:target_mode_check(%{march=*:%*;mcpu=*:%*;:})}}"
++
++#define DRIVER_SELF_SPECS MCPU_MTUNE_NATIVE_SPECS TARGET_MODE_SPECS
+ #define TARGET_SUPPORTS_WIDE_INT 1
+
+ /* For switching between functions with different target attributes. */
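A quick way to observe the effect of the spec above is a compile-time probe; this assumes the driver added -mthumb for a Thumb-only -march value as intended:

    /* Hypothetical check: built with, e.g.,
         arm-none-eabi-gcc -march=armv8-m.base -c probe.c
       __thumb__ should be defined even though -mthumb was not passed.  */
    #ifndef __thumb__
    #error "expected the driver to select Thumb mode automatically"
    #endif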
--- a/src/gcc/config/arm/arm.md
+++ b/src/gcc/config/arm/arm.md
@@ -118,10 +118,10 @@
@@ -51199,7 +55742,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_expand "addsi3"
[(set (match_operand:SI 0 "s_register_operand" "")
(plus:SI (match_operand:SI 1 "s_register_operand" "")
-@@ -616,6 +650,165 @@
+@@ -617,6 +651,165 @@
]
)
@@ -51365,7 +55908,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "addsi3_compare0"
[(set (reg:CC_NOOV CC_REGNUM)
(compare:CC_NOOV
-@@ -865,6 +1058,75 @@
+@@ -866,6 +1059,75 @@
(set_attr "type" "adcs_reg")]
)
@@ -51441,7 +55984,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "*subsi3_carryin"
[(set (match_operand:SI 0 "s_register_operand" "=r,r")
(minus:SI (minus:SI (match_operand:SI 1 "reg_or_int_operand" "r,I")
-@@ -2136,13 +2398,13 @@
+@@ -1895,7 +2157,7 @@
+ [(set (match_operand:SF 0 "s_register_operand" "")
+ (div:SF (match_operand:SF 1 "s_register_operand" "")
+ (match_operand:SF 2 "s_register_operand" "")))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "")
+
+ (define_expand "divdf3"
+@@ -2137,13 +2399,13 @@
for (i = 9; i <= 31; i++)
{
@@ -51457,7 +56009,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
== ~INTVAL (operands[2]))
{
rtx shift = GEN_INT (i);
-@@ -2441,7 +2703,7 @@
+@@ -2442,7 +2704,7 @@
{
int start_bit = INTVAL (operands[2]);
int width = INTVAL (operands[1]);
@@ -51466,7 +56018,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
rtx target, subtarget;
if (arm_arch_thumb2)
-@@ -3743,8 +4005,7 @@
+@@ -3744,8 +4006,7 @@
{
rtx scratch1, scratch2;
@@ -51476,7 +56028,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
{
emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1]));
DONE;
-@@ -3789,7 +4050,7 @@
+@@ -3790,7 +4051,7 @@
"TARGET_EITHER"
"
if (CONST_INT_P (operands[2])
@@ -51485,7 +56037,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
{
emit_insn (gen_movsi (operands[0], const0_rtx));
DONE;
-@@ -3817,8 +4078,7 @@
+@@ -3818,8 +4079,7 @@
{
rtx scratch1, scratch2;
@@ -51495,7 +56047,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
{
emit_insn (gen_arm_ashrdi3_1bit (operands[0], operands[1]));
DONE;
-@@ -3863,7 +4123,7 @@
+@@ -3864,7 +4124,7 @@
"TARGET_EITHER"
"
if (CONST_INT_P (operands[2])
@@ -51504,7 +56056,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
operands[2] = GEN_INT (31);
"
)
-@@ -3888,8 +4148,7 @@
+@@ -3889,8 +4149,7 @@
{
rtx scratch1, scratch2;
@@ -51514,7 +56066,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
{
emit_insn (gen_arm_lshrdi3_1bit (operands[0], operands[1]));
DONE;
-@@ -3934,7 +4193,7 @@
+@@ -3935,7 +4194,7 @@
"TARGET_EITHER"
"
if (CONST_INT_P (operands[2])
@@ -51523,7 +56075,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
{
emit_insn (gen_movsi (operands[0], const0_rtx));
DONE;
-@@ -3968,7 +4227,7 @@
+@@ -3969,7 +4228,7 @@
if (TARGET_32BIT)
{
if (CONST_INT_P (operands[2])
@@ -51532,7 +56084,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
operands[2] = GEN_INT (INTVAL (operands[2]) % 32);
}
else /* TARGET_THUMB1 */
-@@ -4325,23 +4584,29 @@
+@@ -4326,23 +4585,29 @@
;; Division instructions
(define_insn "divsi3"
@@ -51572,7 +56124,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(set_attr "predicable_short_it" "no")
(set_attr "type" "udiv")]
)
-@@ -4349,6 +4614,63 @@
+@@ -4350,6 +4615,63 @@
;; Unary arithmetic insns
@@ -51636,7 +56188,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_expand "negdi2"
[(parallel
[(set (match_operand:DI 0 "s_register_operand" "")
-@@ -4389,6 +4711,20 @@
+@@ -4390,6 +4712,20 @@
(set_attr "type" "multiple")]
)
@@ -51657,7 +56209,25 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_expand "negsi2"
[(set (match_operand:SI 0 "s_register_operand" "")
(neg:SI (match_operand:SI 1 "s_register_operand" "")))]
-@@ -4853,7 +5189,7 @@
+@@ -4412,7 +4748,7 @@
+ (define_expand "negsf2"
+ [(set (match_operand:SF 0 "s_register_operand" "")
+ (neg:SF (match_operand:SF 1 "s_register_operand" "")))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ ""
+ )
+
+@@ -4685,7 +5021,7 @@
+ (define_expand "sqrtsf2"
+ [(set (match_operand:SF 0 "s_register_operand" "")
+ (sqrt:SF (match_operand:SF 1 "s_register_operand" "")))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "")
+
+ (define_expand "sqrtdf2"
+@@ -4854,7 +5190,7 @@
""
)
@@ -51666,7 +56236,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_expand "truncdfhf2"
[(set (match_operand:HF 0 "general_operand" "")
(float_truncate:HF
-@@ -5116,7 +5452,7 @@
+@@ -5117,7 +5453,7 @@
(match_operator 5 "subreg_lowpart_operator"
[(match_operand:SI 4 "s_register_operand" "")]))))]
"TARGET_32BIT
@@ -51675,7 +56245,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
== (GET_MODE_MASK (GET_MODE (operands[5]))
& (GET_MODE_MASK (GET_MODE (operands[5]))
<< (INTVAL (operands[2])))))"
-@@ -5360,7 +5696,7 @@
+@@ -5361,7 +5697,7 @@
""
)
@@ -51684,7 +56254,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_expand "extendhfdf2"
[(set (match_operand:DF 0 "general_operand" "")
(float_extend:DF (match_operand:HF 1 "general_operand" "")))]
-@@ -5698,12 +6034,15 @@
+@@ -5490,7 +5826,7 @@
+ [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r, r, r, q, m")
+ (match_operand:DI 1 "di_operand" "rDa,Db,Dc,mi,q"))]
+ "TARGET_32BIT
+- && !(TARGET_HARD_FLOAT && TARGET_VFP)
++ && !(TARGET_HARD_FLOAT)
+ && !TARGET_IWMMXT
+ && ( register_operand (operands[0], DImode)
+ || register_operand (operands[1], DImode))"
+@@ -5699,12 +6035,15 @@
;; LO_SUM adds in the high bits. Fortunately these are opaque operations
;; so this does not matter.
(define_insn "*arm_movt"
@@ -51706,7 +56285,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(set_attr "predicable_short_it" "no")
(set_attr "length" "4")
(set_attr "type" "alu_sreg")]
-@@ -5725,6 +6064,7 @@
+@@ -5713,8 +6052,7 @@
+ (define_insn "*arm_movsi_insn"
+ [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r,rk,m")
+ (match_operand:SI 1 "general_operand" "rk, I,K,j,mi,rk"))]
+- "TARGET_ARM && ! TARGET_IWMMXT
+- && !(TARGET_HARD_FLOAT && TARGET_VFP)
++ "TARGET_ARM && !TARGET_IWMMXT && !TARGET_HARD_FLOAT
+ && ( register_operand (operands[0], SImode)
+ || register_operand (operands[1], SImode))"
+ "@
+@@ -5726,6 +6064,7 @@
str%?\\t%1, %0"
[(set_attr "type" "mov_reg,mov_imm,mvn_imm,mov_imm,load1,store1")
(set_attr "predicable" "yes")
@@ -51714,7 +56303,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(set_attr "pool_range" "*,*,*,*,4096,*")
(set_attr "neg_pool_range" "*,*,*,*,4084,*")]
)
-@@ -5761,7 +6101,8 @@
+@@ -5762,7 +6101,8 @@
[(set (match_operand:SI 0 "arm_general_register_operand" "")
(const:SI (plus:SI (match_operand:SI 1 "general_operand" "")
(match_operand:SI 2 "const_int_operand" ""))))]
@@ -51724,7 +56313,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
&& arm_disable_literal_pool
&& reload_completed
&& GET_CODE (operands[1]) == SYMBOL_REF"
-@@ -5792,8 +6133,7 @@
+@@ -5793,8 +6133,7 @@
(define_split
[(set (match_operand:SI 0 "arm_general_register_operand" "")
(match_operand:SI 1 "general_operand" ""))]
@@ -51734,25 +56323,25 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
&& !flag_pic && !target_word_relocations
&& !arm_tls_referenced_p (operands[1])"
[(clobber (const_int 0))]
-@@ -6361,7 +6701,7 @@
+@@ -6362,7 +6701,7 @@
[(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,m,r")
(match_operand:HI 1 "general_operand" "rIk,K,n,r,mi"))]
"TARGET_ARM
- && arm_arch4
-+ && arm_arch4 && !(TARGET_HARD_FLOAT && TARGET_VFP)
++ && arm_arch4 && !TARGET_HARD_FLOAT
&& (register_operand (operands[0], HImode)
|| register_operand (operands[1], HImode))"
"@
-@@ -6387,7 +6727,7 @@
+@@ -6388,7 +6727,7 @@
(define_insn "*movhi_bytes"
[(set (match_operand:HI 0 "s_register_operand" "=r,r,r")
(match_operand:HI 1 "arm_rhs_operand" "I,rk,K"))]
- "TARGET_ARM"
-+ "TARGET_ARM && !(TARGET_HARD_FLOAT && TARGET_VFP)"
++ "TARGET_ARM && !TARGET_HARD_FLOAT"
"@
mov%?\\t%0, %1\\t%@ movhi
mov%?\\t%0, %1\\t%@ movhi
-@@ -6395,7 +6735,7 @@
+@@ -6396,7 +6735,7 @@
[(set_attr "predicable" "yes")
(set_attr "type" "mov_imm,mov_reg,mvn_imm")]
)
@@ -51761,7 +56350,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
;; We use a DImode scratch because we may occasionally need an additional
;; temporary if the address isn't offsettable -- push_reload doesn't seem
;; to take any notice of the "o" constraints on reload_memory_operand operand.
-@@ -6517,7 +6857,7 @@
+@@ -6518,7 +6857,7 @@
strb%?\\t%1, %0"
[(set_attr "type" "mov_reg,mov_reg,mov_imm,mov_imm,mvn_imm,load1,store1,load1,store1")
(set_attr "predicable" "yes")
@@ -51770,16 +56359,34 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(set_attr "arch" "t2,any,any,t2,any,t2,t2,any,any")
(set_attr "length" "2,4,4,2,4,2,2,4,4")]
)
-@@ -6547,7 +6887,7 @@
+@@ -6548,7 +6887,7 @@
(define_insn "*arm32_movhf"
[(set (match_operand:HF 0 "nonimmediate_operand" "=r,m,r,r")
(match_operand:HF 1 "general_operand" " m,r,r,F"))]
- "TARGET_32BIT && !(TARGET_HARD_FLOAT && TARGET_FP16)
-+ "TARGET_32BIT && !(TARGET_HARD_FLOAT && TARGET_VFP)
++ "TARGET_32BIT && !TARGET_HARD_FLOAT
&& ( s_register_operand (operands[0], HFmode)
|| s_register_operand (operands[1], HFmode))"
"*
-@@ -7365,6 +7705,24 @@
+@@ -6892,7 +7231,7 @@
+ [(set (pc) (if_then_else
+ (match_operator 0 "expandable_comparison_operator"
+ [(match_operand:SF 1 "s_register_operand" "")
+- (match_operand:SF 2 "arm_float_compare_operand" "")])
++ (match_operand:SF 2 "vfp_compare_operand" "")])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))]
+ "TARGET_32BIT && TARGET_HARD_FLOAT"
+@@ -6904,7 +7243,7 @@
+ [(set (pc) (if_then_else
+ (match_operator 0 "expandable_comparison_operator"
+ [(match_operand:DF 1 "s_register_operand" "")
+- (match_operand:DF 2 "arm_float_compare_operand" "")])
++ (match_operand:DF 2 "vfp_compare_operand" "")])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))]
+ "TARGET_32BIT && TARGET_HARD_FLOAT && !TARGET_VFP_SINGLE"
+@@ -7366,11 +7705,29 @@
DONE;
}")
@@ -51787,7 +56394,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ [(set (match_operand:SI 0 "s_register_operand")
+ (match_operator:SI 1 "expandable_comparison_operator"
+ [(match_operand:HF 2 "s_register_operand")
-+ (match_operand:HF 3 "arm_float_compare_operand")]))]
++ (match_operand:HF 3 "vfp_compare_operand")]))]
+ "TARGET_VFP_FP16INST"
+ {
+ if (!arm_validize_comparison (&operands[1],
@@ -51804,7 +56411,22 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_expand "cstoresf4"
[(set (match_operand:SI 0 "s_register_operand" "")
(match_operator:SI 1 "expandable_comparison_operator"
-@@ -7417,9 +7775,31 @@
+ [(match_operand:SF 2 "s_register_operand" "")
+- (match_operand:SF 3 "arm_float_compare_operand" "")]))]
++ (match_operand:SF 3 "vfp_compare_operand" "")]))]
+ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "emit_insn (gen_cstore_cc (operands[0], operands[1],
+ operands[2], operands[3])); DONE;"
+@@ -7380,7 +7737,7 @@
+ [(set (match_operand:SI 0 "s_register_operand" "")
+ (match_operator:SI 1 "expandable_comparison_operator"
+ [(match_operand:DF 2 "s_register_operand" "")
+- (match_operand:DF 3 "arm_float_compare_operand" "")]))]
++ (match_operand:DF 3 "vfp_compare_operand" "")]))]
+ "TARGET_32BIT && TARGET_HARD_FLOAT && !TARGET_VFP_SINGLE"
+ "emit_insn (gen_cstore_cc (operands[0], operands[1],
+ operands[2], operands[3])); DONE;"
+@@ -7418,9 +7775,31 @@
rtx ccreg;
if (!arm_validize_comparison (&operands[1], &XEXP (operands[1], 0),
@@ -51838,7 +56460,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
code = GET_CODE (operands[1]);
ccreg = arm_gen_compare_reg (code, XEXP (operands[1], 0),
XEXP (operands[1], 1), NULL_RTX);
-@@ -7438,7 +7818,7 @@
+@@ -7439,7 +7818,7 @@
enum rtx_code code = GET_CODE (operands[1]);
rtx ccreg;
@@ -51847,7 +56469,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
&XEXP (operands[1], 1)))
FAIL;
-@@ -7503,6 +7883,37 @@
+@@ -7504,6 +7883,37 @@
(set_attr "type" "fcsel")]
)
@@ -51885,7 +56507,116 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn_and_split "*movsicc_insn"
[(set (match_operand:SI 0 "s_register_operand" "=r,r,r,r,r,r,r,r")
(if_then_else:SI
-@@ -8152,8 +8563,8 @@
+@@ -7627,6 +8037,7 @@
+ "
+ {
+ rtx callee, pat;
++ tree addr = MEM_EXPR (operands[0]);
+
+ /* In an untyped call, we can get NULL for operand 2. */
+ if (operands[2] == NULL_RTX)
+@@ -7641,8 +8052,17 @@
+ : !REG_P (callee))
+ XEXP (operands[0], 0) = force_reg (Pmode, callee);
+
+- pat = gen_call_internal (operands[0], operands[1], operands[2]);
+- arm_emit_call_insn (pat, XEXP (operands[0], 0), false);
++ if (detect_cmse_nonsecure_call (addr))
++ {
++ pat = gen_nonsecure_call_internal (operands[0], operands[1],
++ operands[2]);
++ emit_call_insn (pat);
++ }
++ else
++ {
++ pat = gen_call_internal (operands[0], operands[1], operands[2]);
++ arm_emit_call_insn (pat, XEXP (operands[0], 0), false);
++ }
+ DONE;
+ }"
+ )
+@@ -7653,6 +8073,24 @@
+ (use (match_operand 2 "" ""))
+ (clobber (reg:SI LR_REGNUM))])])
+
++(define_expand "nonsecure_call_internal"
++ [(parallel [(call (unspec:SI [(match_operand 0 "memory_operand" "")]
++ UNSPEC_NONSECURE_MEM)
++ (match_operand 1 "general_operand" ""))
++ (use (match_operand 2 "" ""))
++ (clobber (reg:SI LR_REGNUM))
++ (clobber (reg:SI 4))])]
++ "use_cmse"
++ "
++ {
++ rtx tmp;
++ tmp = copy_to_suggested_reg (XEXP (operands[0], 0),
++ gen_rtx_REG (SImode, 4),
++ SImode);
++
++ operands[0] = replace_equiv_address (operands[0], tmp);
++ }")
++
+ (define_insn "*call_reg_armv5"
+ [(call (mem:SI (match_operand:SI 0 "s_register_operand" "r"))
+ (match_operand 1 "" ""))
+@@ -7688,6 +8126,7 @@
+ "
+ {
+ rtx pat, callee;
++ tree addr = MEM_EXPR (operands[1]);
+
+ /* In an untyped call, we can get NULL for operand 2. */
+ if (operands[3] == 0)
+@@ -7702,9 +8141,18 @@
+ : !REG_P (callee))
+ XEXP (operands[1], 0) = force_reg (Pmode, callee);
+
+- pat = gen_call_value_internal (operands[0], operands[1],
+- operands[2], operands[3]);
+- arm_emit_call_insn (pat, XEXP (operands[1], 0), false);
++ if (detect_cmse_nonsecure_call (addr))
++ {
++ pat = gen_nonsecure_call_value_internal (operands[0], operands[1],
++ operands[2], operands[3]);
++ emit_call_insn (pat);
++ }
++ else
++ {
++ pat = gen_call_value_internal (operands[0], operands[1],
++ operands[2], operands[3]);
++ arm_emit_call_insn (pat, XEXP (operands[1], 0), false);
++ }
+ DONE;
+ }"
+ )
+@@ -7716,6 +8164,25 @@
+ (use (match_operand 3 "" ""))
+ (clobber (reg:SI LR_REGNUM))])])
+
++(define_expand "nonsecure_call_value_internal"
++ [(parallel [(set (match_operand 0 "" "")
++ (call (unspec:SI [(match_operand 1 "memory_operand" "")]
++ UNSPEC_NONSECURE_MEM)
++ (match_operand 2 "general_operand" "")))
++ (use (match_operand 3 "" ""))
++ (clobber (reg:SI LR_REGNUM))
++ (clobber (reg:SI 4))])]
++ "use_cmse"
++ "
++ {
++ rtx tmp;
++ tmp = copy_to_suggested_reg (XEXP (operands[1], 0),
++ gen_rtx_REG (SImode, 4),
++ SImode);
++
++ operands[1] = replace_equiv_address (operands[1], tmp);
++ }")
++
+ (define_insn "*call_value_reg_armv5"
+ [(set (match_operand 0 "" "")
+ (call (mem:SI (match_operand:SI 1 "s_register_operand" "r"))
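These expanders implement the cmse_nonsecure_call attribute; a minimal sketch of a secure-to-non-secure callback, assuming compilation with -mcmse (the typedef and names are illustrative):

    #include <arm_cmse.h>

    /* Calls through ns_cb_t are routed via nonsecure_call_internal above:
       the target address is copied into r4 (hence the clobber of reg 4)
       and the call is wrapped in UNSPEC_NONSECURE_MEM so later passes can
       emit the register clearing and the BLXNS transition.  */
    typedef void __attribute__ ((cmse_nonsecure_call)) ns_cb_t (int);

    void
    notify_nonsecure (ns_cb_t *cb, int event)
    {
      cb (event);
    }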
+@@ -8153,8 +8620,8 @@
)
(define_insn "probe_stack"
@@ -51896,7 +56627,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
"TARGET_32BIT"
"str%?\\tr0, %0"
[(set_attr "type" "store1")
-@@ -10220,8 +10631,8 @@
+@@ -10221,8 +10688,8 @@
(match_operand 1 "const_int_operand" "")))
(clobber (match_scratch:SI 2 ""))]
"TARGET_ARM
@@ -51907,7 +56638,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
[(set (match_dup 2) (zero_extend:SI (match_dup 0)))
(set (reg:CC CC_REGNUM) (compare:CC (match_dup 2) (match_dup 1)))]
"
-@@ -10561,7 +10972,11 @@
+@@ -10562,7 +11029,11 @@
}
"
[(set_attr "type" "load4")
@@ -51920,7 +56651,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
)
;; Pop with return (as used in epilogue RTL)
-@@ -10590,7 +11005,10 @@
+@@ -10591,7 +11062,10 @@
}
"
[(set_attr "type" "load4")
@@ -51932,7 +56663,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
)
(define_insn "*pop_multiple_with_return"
-@@ -10610,7 +11028,10 @@
+@@ -10611,7 +11085,10 @@
}
"
[(set_attr "type" "load4")
@@ -51944,7 +56675,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
)
;; Load into PC and return
-@@ -10821,19 +11242,22 @@
+@@ -10632,7 +11109,7 @@
+ (match_operand:SI 2 "const_int_I_operand" "I")))
+ (set (match_operand:DF 3 "vfp_hard_register_operand" "")
+ (mem:DF (match_dup 1)))])]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "*
+ {
+ int num_regs = XVECLEN (operands[0], 0);
+@@ -10822,19 +11299,22 @@
(set_attr "predicable_short_it" "no")
(set_attr "type" "clz")])
@@ -51978,7 +56718,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
;; V5E instructions.
-@@ -10957,13 +11381,16 @@
+@@ -10958,13 +11438,16 @@
;; We only care about the lower 16 bits of the constant
;; being inserted into the upper 16 bits of the register.
(define_insn "*arm_movtas_ze"
@@ -51999,6 +56739,221 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(set_attr "predicable_short_it" "no")
(set_attr "length" "4")
(set_attr "type" "alu_sreg")]
+--- a/src/gcc/config/arm/arm.opt
++++ b/src/gcc/config/arm/arm.opt
+@@ -109,6 +109,10 @@ mfloat-abi=
+ Target RejectNegative Joined Enum(float_abi_type) Var(arm_float_abi) Init(TARGET_DEFAULT_FLOAT_ABI)
+ Specify if floating point hardware should be used.
+
++mcmse
++Target RejectNegative Var(use_cmse)
++Specify that the compiler should target secure code as per ARMv8-M Security Extensions.
++
+ Enum
+ Name(float_abi_type) Type(enum float_abi_type)
+ Known floating-point ABIs (for use with the -mfloat-abi= option):
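The new option gates everything above; a small compile-time probe, assuming an arm-none-eabi toolchain (the invocation in the comment is illustrative):

    /* Built with, e.g.: arm-none-eabi-gcc -march=armv8-m.main -mcmse -c t.c
       -mcmse sets bit 0 of __ARM_FEATURE_CMSE and unlocks <arm_cmse.h>.  */
    #if !defined (__ARM_FEATURE_CMSE) || !(__ARM_FEATURE_CMSE & 1)
    #error "secure code must be compiled with -mcmse"
    #endif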
+--- /dev/null
++++ b/src/gcc/config/arm/arm_cmse.h
+@@ -0,0 +1,199 @@
++/* ARMv8-M Secure Extensions intrinsics include file.
++
++ Copyright (C) 2015-2016 Free Software Foundation, Inc.
++ Contributed by ARM Ltd.
++
++ This file is part of GCC.
++
++ GCC is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published
++ by the Free Software Foundation; either version 3, or (at your
++ option) any later version.
++
++ GCC is distributed in the hope that it will be useful, but WITHOUT
++ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
++ License for more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++
++#ifndef _GCC_ARM_CMSE_H
++#define _GCC_ARM_CMSE_H
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++#if __ARM_FEATURE_CMSE & 1
++
++#include <stddef.h>
++#include <stdint.h>
++
++#ifdef __ARM_BIG_ENDIAN
++
++typedef union {
++ struct cmse_address_info {
++#if __ARM_FEATURE_CMSE & 2
++ unsigned idau_region:8;
++ unsigned idau_region_valid:1;
++ unsigned secure:1;
++ unsigned nonsecure_readwrite_ok:1;
++ unsigned nonsecure_read_ok:1;
++#else
++ unsigned :12;
++#endif
++ unsigned readwrite_ok:1;
++ unsigned read_ok:1;
++#if __ARM_FEATURE_CMSE & 2
++ unsigned sau_region_valid:1;
++#else
++ unsigned :1;
++#endif
++ unsigned mpu_region_valid:1;
++#if __ARM_FEATURE_CMSE & 2
++ unsigned sau_region:8;
++#else
++ unsigned :8;
++#endif
++ unsigned mpu_region:8;
++ } flags;
++ unsigned value;
++} cmse_address_info_t;
++
++#else
++
++typedef union {
++ struct cmse_address_info {
++ unsigned mpu_region:8;
++#if __ARM_FEATURE_CMSE & 2
++ unsigned sau_region:8;
++#else
++ unsigned :8;
++#endif
++ unsigned mpu_region_valid:1;
++#if __ARM_FEATURE_CMSE & 2
++ unsigned sau_region_valid:1;
++#else
++ unsigned :1;
++#endif
++ unsigned read_ok:1;
++ unsigned readwrite_ok:1;
++#if __ARM_FEATURE_CMSE & 2
++ unsigned nonsecure_read_ok:1;
++ unsigned nonsecure_readwrite_ok:1;
++ unsigned secure:1;
++ unsigned idau_region_valid:1;
++ unsigned idau_region:8;
++#else
++ unsigned :12;
++#endif
++ } flags;
++ unsigned value;
++} cmse_address_info_t;
++
++#endif /* __ARM_BIG_ENDIAN */
++
++#define cmse_TT_fptr(p) (__cmse_TT_fptr ((__cmse_fptr)(p)))
++
++typedef void (*__cmse_fptr)(void);
++
++#define __CMSE_TT_ASM(flags) \
++{ \
++ cmse_address_info_t __result; \
++ __asm__ ("tt" # flags " %0,%1" \
++ : "=r"(__result) \
++ : "r"(__p) \
++ : "memory"); \
++ return __result; \
++}
++
++__extension__ static __inline __attribute__ ((__always_inline__))
++cmse_address_info_t
++__cmse_TT_fptr (__cmse_fptr __p)
++__CMSE_TT_ASM ()
++
++__extension__ static __inline __attribute__ ((__always_inline__))
++cmse_address_info_t
++cmse_TT (void *__p)
++__CMSE_TT_ASM ()
++
++#define cmse_TTT_fptr(p) (__cmse_TTT_fptr ((__cmse_fptr)(p)))
++
++__extension__ static __inline __attribute__ ((__always_inline__))
++cmse_address_info_t
++__cmse_TTT_fptr (__cmse_fptr __p)
++__CMSE_TT_ASM (t)
++
++__extension__ static __inline __attribute__ ((__always_inline__))
++cmse_address_info_t
++cmse_TTT (void *__p)
++__CMSE_TT_ASM (t)
++
++#if __ARM_FEATURE_CMSE & 2
++
++#define cmse_TTA_fptr(p) (__cmse_TTA_fptr ((__cmse_fptr)(p)))
++
++__extension__ static __inline __attribute__ ((__always_inline__))
++cmse_address_info_t
++__cmse_TTA_fptr (__cmse_fptr __p)
++__CMSE_TT_ASM (a)
++
++__extension__ static __inline __attribute__ ((__always_inline__))
++cmse_address_info_t
++cmse_TTA (void *__p)
++__CMSE_TT_ASM (a)
++
++#define cmse_TTAT_fptr(p) (__cmse_TTAT_fptr ((__cmse_fptr)(p)))
++
++__extension__ static __inline cmse_address_info_t
++__attribute__ ((__always_inline__))
++__cmse_TTAT_fptr (__cmse_fptr __p)
++__CMSE_TT_ASM (at)
++
++__extension__ static __inline cmse_address_info_t
++__attribute__ ((__always_inline__))
++cmse_TTAT (void *__p)
++__CMSE_TT_ASM (at)
++
++/* FIXME: diagnose use outside cmse_nonsecure_entry functions. */
++__extension__ static __inline int __attribute__ ((__always_inline__))
++cmse_nonsecure_caller (void)
++{
++ return __builtin_arm_cmse_nonsecure_caller ();
++}
++
++#define CMSE_AU_NONSECURE 2
++#define CMSE_MPU_NONSECURE 16
++#define CMSE_NONSECURE 18
++
++#define cmse_nsfptr_create(p) ((typeof ((p))) ((intptr_t) (p) & ~1))
++
++#define cmse_is_nsfptr(p) (!((intptr_t) (p) & 1))
++
++#endif /* __ARM_FEATURE_CMSE & 2 */
++
++#define CMSE_MPU_UNPRIV 4
++#define CMSE_MPU_READWRITE 1
++#define CMSE_MPU_READ 8
++
++__extension__ void *
++cmse_check_address_range (void *, size_t, int);
++
++#define cmse_check_pointed_object(p, f) \
++ ((typeof ((p))) cmse_check_address_range ((p), sizeof (*(p)), (f)))
++
++#endif /* __ARM_FEATURE_CMSE & 1 */
++
++#ifdef __cplusplus
++}
++#endif
++
++#endif /* _GCC_ARM_CMSE_H */
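A brief usage sketch for the header above, assuming a target where __ARM_FEATURE_CMSE & 2 holds (CMSE_NONSECURE is only defined there); the buffer and length are made up:

    #include <arm_cmse.h>
    #include <stddef.h>

    /* Validate a buffer handed over from the non-secure side before
       using it: cmse_check_address_range returns NULL unless the whole
       range passes the requested permission checks.  */
    void *
    validate_ns_buffer (void *buf, size_t len)
    {
      return cmse_check_address_range (buf, len,
                                       CMSE_NONSECURE | CMSE_MPU_READWRITE);
    }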
--- /dev/null
+++ b/src/gcc/config/arm/arm_fp16.h
@@ -0,0 +1,255 @@
@@ -52347,7 +57302,58 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
vabd_s8 (int8x8_t __a, int8x8_t __b)
{
-@@ -14830,6 +14843,855 @@ vmull_high_p64 (poly64x2_t __a, poly64x2_t __b)
+@@ -2943,6 +2956,34 @@ vmaxq_f32 (float32x4_t __a, float32x4_t __b)
+ return (float32x4_t)__builtin_neon_vmaxfv4sf (__a, __b);
+ }
+
++#pragma GCC push_options
++#pragma GCC target ("fpu=neon-fp-armv8")
++__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++vmaxnm_f32 (float32x2_t a, float32x2_t b)
++{
++ return (float32x2_t)__builtin_neon_vmaxnmv2sf (a, b);
++}
++
++__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++vmaxnmq_f32 (float32x4_t a, float32x4_t b)
++{
++ return (float32x4_t)__builtin_neon_vmaxnmv4sf (a, b);
++}
++
++__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++vminnm_f32 (float32x2_t a, float32x2_t b)
++{
++ return (float32x2_t)__builtin_neon_vminnmv2sf (a, b);
++}
++
++__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++vminnmq_f32 (float32x4_t a, float32x4_t b)
++{
++ return (float32x4_t)__builtin_neon_vminnmv4sf (a, b);
++}
++#pragma GCC pop_options
++
++
+ __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+ vmaxq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+@@ -5370,6 +5411,15 @@ vget_lane_s64 (int64x1_t __a, const int __b)
+ return (int64_t)__builtin_neon_vget_lanedi (__a, __b);
+ }
+
++#pragma GCC push_options
++#pragma GCC target ("fpu=crypto-neon-fp-armv8")
++__extension__ static __inline poly64_t __attribute__ ((__always_inline__))
++vget_lane_p64 (poly64x1_t __a, const int __b)
++{
++ return (poly64_t)__builtin_neon_vget_lanedi ((int64x1_t) __a, __b);
++}
++
++#pragma GCC pop_options
+ __extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+ vget_lane_u64 (uint64x1_t __a, const int __b)
+ {
+@@ -14830,6 +14880,855 @@ vmull_high_p64 (poly64x2_t __a, poly64x2_t __b)
#pragma GCC pop_options
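The vmaxnm/vminnm intrinsics added above map to the ARMv8 VMAXNM/VMINNM
instructions, which follow IEEE 754-2008 maxNum/minNum semantics: when one
operand is a quiet NaN, the other operand is returned rather than NaN. A
usage sketch, assuming the file is built with the neon-fp-armv8 FPU that
the pragma selects:

    #include <arm_neon.h>

    /* Clamp each lane of x to [lo, hi]; a NaN lane in x collapses to
       the corresponding bound instead of propagating.  */
    float32x4_t
    clamp_f32 (float32x4_t x, float32x4_t lo, float32x4_t hi)
    {
      return vminnmq_f32 (vmaxnmq_f32 (x, lo), hi);
    }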
@@ -53270,12 +58276,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
VAR6 (BINOP, vmaxu, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
VAR2 (BINOP, vmaxf, v2sf, v4sf)
+VAR2 (BINOP, vmaxf, v8hf, v4hf)
-+VAR2 (BINOP, vmaxnm, v4hf, v8hf)
++VAR4 (BINOP, vmaxnm, v2sf, v4sf, v4hf, v8hf)
VAR6 (BINOP, vmins, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
VAR6 (BINOP, vminu, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
VAR2 (BINOP, vminf, v2sf, v4sf)
+VAR2 (BINOP, vminf, v4hf, v8hf)
-+VAR2 (BINOP, vminnm, v8hf, v4hf)
++VAR4 (BINOP, vminnm, v2sf, v4sf, v8hf, v4hf)
VAR3 (BINOP, vpmaxs, v8qi, v4hi, v2si)
VAR3 (BINOP, vpmaxu, v8qi, v4hi, v2si)
@@ -53465,9 +58471,9 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
|march=armv8.1-a+crc \
+ |march=armv8.2-a \
+ |march=armv8.2-a+fp16 \
-+ |march=armv8-m.base \
++ |march=armv8-m.base|mcpu=cortex-m23 \
+ |march=armv8-m.main \
-+ |march=armv8-m.main+dsp \
++ |march=armv8-m.main+dsp|mcpu=cortex-m33 \
:%{!r:--be8}}}"
#else
#define BE8_LINK_SPEC \
@@ -53487,15 +58493,30 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
|march=armv8.1-a+crc \
+ |march=armv8.2-a \
+ |march=armv8.2-a+fp16 \
-+ |march=armv8-m.base \
++ |march=armv8-m.base|mcpu=cortex-m23 \
+ |march=armv8-m.main \
-+ |march=armv8-m.main+dsp \
++ |march=armv8-m.main+dsp|mcpu=cortex-m33 \
:%{!r:--be8}}}"
#endif
--- a/src/gcc/config/arm/constraints.md
+++ b/src/gcc/config/arm/constraints.md
-@@ -66,7 +66,7 @@
+@@ -34,11 +34,13 @@
+ ;; in ARM/Thumb-2 state: Da, Db, Dc, Dd, Dn, Dl, DL, Do, Dv, Dy, Di, Dt, Dp, Dz
+ ;; in Thumb-1 state: Pa, Pb, Pc, Pd, Pe
+ ;; in Thumb-2 state: Pj, PJ, Ps, Pt, Pu, Pv, Pw, Px, Py
++;; in all states: Pf
+
+ ;; The following memory constraints have been used:
+-;; in ARM/Thumb-2 state: Q, Uh, Ut, Uv, Uy, Un, Um, Us
++;; in ARM/Thumb-2 state: Uh, Ut, Uv, Uy, Un, Um, Us
+ ;; in ARM state: Uq
+ ;; in Thumb state: Uu, Uw
++;; in all states: Q
+
+
+ (define_register_constraint "t" "TARGET_32BIT ? VFP_LO_REGS : NO_REGS"
+@@ -66,7 +68,7 @@
(define_constraint "j"
"A constant suitable for a MOVW instruction. (ARM/Thumb-2)"
@@ -53504,6 +58525,45 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(ior (and (match_code "high")
(match_test "arm_valid_symbolic_address_p (XEXP (op, 0))"))
(and (match_code "const_int")
+@@ -180,6 +182,13 @@
+ (and (match_code "const_int")
+ (match_test "TARGET_THUMB1 && ival >= 256 && ival <= 510")))
+
++(define_constraint "Pf"
++ "Memory models except relaxed, consume or release ones."
++ (and (match_code "const_int")
++ (match_test "!is_mm_relaxed (memmodel_from_int (ival))
++ && !is_mm_consume (memmodel_from_int (ival))
++ && !is_mm_release (memmodel_from_int (ival))")))
++
+ (define_constraint "Ps"
+ "@internal In Thumb-2 state a constant in the range -255 to +255"
+ (and (match_code "const_int")
+@@ -333,13 +342,13 @@
+ "@internal
+ In ARM/ Thumb2 a const_double which can be used with a vcvt.f32.s32 with fract bits operation"
+ (and (match_code "const_double")
+- (match_test "TARGET_32BIT && TARGET_VFP && vfp3_const_double_for_fract_bits (op)")))
++ (match_test "TARGET_32BIT && vfp3_const_double_for_fract_bits (op)")))
+
+ (define_constraint "Dp"
+ "@internal
+ In ARM/ Thumb2 a const_double which can be used with a vcvt.s32.f32 with bits operation"
+ (and (match_code "const_double")
+- (match_test "TARGET_32BIT && TARGET_VFP
++ (match_test "TARGET_32BIT
+ && vfp3_const_double_for_bits (op) > 0")))
+
+ (define_register_constraint "Ts" "(arm_restrict_it) ? LO_REGS : GENERAL_REGS"
+@@ -407,7 +416,7 @@
+
+ (define_memory_constraint "Q"
+ "@internal
+- In ARM/Thumb-2 state an address that is a single base register."
++ An address that is a single base register."
+ (and (match_code "mem")
+ (match_test "REG_P (XEXP (op, 0))")))
+
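The new Pf constraint above matches any memory-model constant other than
relaxed, consume or release; the reworked atomic_load/atomic_store patterns
in sync.md further below use it to keep the alternative that must emit the
acquiring/releasing lda/stl form apart from the one that can emit a plain
ldr/str. At the source level the distinction is simply the memory order;
a C11 sketch (illustrative, not the exact generated code):

    #include <stdatomic.h>

    atomic_int flag;

    int load_relaxed (void)
    {
      /* Relaxed/consume models fail the Pf test: plain ldr.  */
      return atomic_load_explicit (&flag, memory_order_relaxed);
    }

    int load_acquire (void)
    {
      /* Acquire and stronger models match Pf: lda.  */
      return atomic_load_explicit (&flag, memory_order_acquire);
    }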
--- a/src/gcc/config/arm/cortex-a53.md
+++ b/src/gcc/config/arm/cortex-a53.md
@@ -30,6 +30,7 @@
@@ -53813,7 +58873,25 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
{"0xc20", "armv6-m", "cortex-m0"},
--- a/src/gcc/config/arm/elf.h
+++ b/src/gcc/config/arm/elf.h
-@@ -148,8 +148,9 @@
+@@ -75,16 +75,7 @@
+
+ /* We might need an ARM-specific header for function declarations. */
+ #undef ASM_DECLARE_FUNCTION_NAME
+-#define ASM_DECLARE_FUNCTION_NAME(FILE, NAME, DECL) \
+- do \
+- { \
+- ARM_DECLARE_FUNCTION_NAME (FILE, NAME, DECL); \
+- ASM_OUTPUT_TYPE_DIRECTIVE (FILE, NAME, "function"); \
+- ASM_DECLARE_RESULT (FILE, DECL_RESULT (DECL)); \
+- ASM_OUTPUT_LABEL(FILE, NAME); \
+- ARM_OUTPUT_FN_UNWIND (FILE, TRUE); \
+- } \
+- while (0)
++#define ASM_DECLARE_FUNCTION_NAME arm_asm_declare_function_name
+
+ /* We might need an ARM specific trailer for function declarations. */
+ #undef ASM_DECLARE_FUNCTION_SIZE
+@@ -148,8 +139,9 @@
while (0)
/* Horrible hack: We want to prevent some libgcc routines being included
@@ -53827,6 +58905,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
#undef L_truncdfsf2
--- a/src/gcc/config/arm/iterators.md
+++ b/src/gcc/config/arm/iterators.md
+@@ -46,7 +46,7 @@
+ (define_mode_iterator SIDI [SI DI])
+
+ ;; A list of modes which the VFP unit can handle
+-(define_mode_iterator SDF [(SF "TARGET_VFP") (DF "TARGET_VFP_DOUBLE")])
++(define_mode_iterator SDF [(SF "") (DF "TARGET_VFP_DOUBLE")])
+
+ ;; Integer element sizes implemented by IWMMXT.
+ (define_mode_iterator VMMX [V2SI V4HI V8QI])
@@ -119,6 +119,10 @@
;; All supported vector modes (except those with 64-bit integer elements).
(define_mode_iterator VDQW [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF])
@@ -54688,25 +59775,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "*umin<mode>3_neon"
[(set (match_operand:VDQIW 0 "s_register_operand" "=w")
(umin:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")
-@@ -1082,7 +1180,7 @@
- }
- else
- {
-- if (CONST_INT_P (operands[2]) && INTVAL (operands[2]) == 1
-+ if (operands[2] == CONST1_RTX (SImode)
- && (!reg_overlap_mentioned_p (operands[0], operands[1])
- || REGNO (operands[0]) == REGNO (operands[1])))
- /* This clobbers CC. */
-@@ -1184,7 +1282,7 @@
- }
- else
- {
-- if (CONST_INT_P (operands[2]) && INTVAL (operands[2]) == 1
-+ if (operands[2] == CONST1_RTX (SImode)
- && (!reg_overlap_mentioned_p (operands[0], operands[1])
- || REGNO (operands[0]) == REGNO (operands[1])))
- /* This clobbers CC. */
-@@ -1204,16 +1302,133 @@
+@@ -1208,16 +1306,133 @@
;; Widening operations
@@ -54843,7 +59912,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "widen_usum<mode>3"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(plus:<V_widen> (zero_extend:<V_widen>
-@@ -1484,6 +1699,17 @@
+@@ -1488,6 +1703,17 @@
(const_string "neon_reduc_add<q>")))]
)
@@ -54861,7 +59930,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "neon_vpsmin<mode>"
[(set (match_operand:VD 0 "s_register_operand" "=w")
(unspec:VD [(match_operand:VD 1 "s_register_operand" "w")
-@@ -1832,6 +2058,26 @@
+@@ -1836,6 +2062,26 @@
DONE;
})
@@ -54888,7 +59957,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
; Note that NEON operations don't support the full IEEE 754 standard: in
; particular, denormal values are flushed to zero. This means that GCC cannot
; use those instructions for autovectorization, etc. unless
-@@ -1923,6 +2169,17 @@
+@@ -1927,6 +2173,17 @@
(const_string "neon_mul_<V_elem_ch><q>")))]
)
@@ -54906,7 +59975,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_expand "neon_vmla<mode>"
[(match_operand:VDQW 0 "s_register_operand" "=w")
(match_operand:VDQW 1 "s_register_operand" "0")
-@@ -1951,6 +2208,18 @@
+@@ -1955,6 +2212,18 @@
DONE;
})
@@ -54925,7 +59994,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_expand "neon_vfms<VCVTF:mode>"
[(match_operand:VCVTF 0 "s_register_operand")
(match_operand:VCVTF 1 "s_register_operand")
-@@ -1963,6 +2232,18 @@
+@@ -1967,6 +2236,18 @@
DONE;
})
@@ -54944,7 +60013,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
; Used for intrinsics when flag_unsafe_math_optimizations is false.
(define_insn "neon_vmla<mode>_unspec"
-@@ -2263,6 +2544,72 @@
+@@ -2267,6 +2548,72 @@
[(set_attr "type" "neon_fp_compare_s<q>")]
)
@@ -55017,7 +60086,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "neon_vc<cmp_op>u<mode>"
[(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w")
(neg:<V_cmp_result>
-@@ -2314,6 +2661,60 @@
+@@ -2318,6 +2665,60 @@
[(set_attr "type" "neon_fp_compare_s<q>")]
)
@@ -55078,7 +60147,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "neon_vtst<mode>"
[(set (match_operand:VDQIW 0 "s_register_operand" "=w")
(unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
-@@ -2334,6 +2735,16 @@
+@@ -2338,6 +2739,16 @@
[(set_attr "type" "neon_abd<q>")]
)
@@ -55095,7 +60164,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "neon_vabdf<mode>"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w")
-@@ -2396,6 +2807,40 @@
+@@ -2400,6 +2811,51 @@
[(set_attr "type" "neon_fp_minmax_s<q>")]
)
@@ -55133,10 +60202,21 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ [(set_attr "type" "neon_fp_minmax_s<q>")]
+)
+
++;; v<maxmin>nm intrinsics.
++(define_insn "neon_<fmaxmin_op><mode>"
++ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
++ (unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w")
++ (match_operand:VCVTF 2 "s_register_operand" "w")]
++ VMAXMINFNM))]
++ "TARGET_NEON && TARGET_FPU_ARMV8"
++ "<fmaxmin_op>.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
++ [(set_attr "type" "neon_fp_minmax_s<q>")]
++)
++
;; Vector forms for the IEEE-754 fmax()/fmin() functions
(define_insn "<fmaxmin><mode>3"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
-@@ -2467,6 +2912,17 @@
+@@ -2471,6 +2927,17 @@
[(set_attr "type" "neon_fp_recps_s<q>")]
)
@@ -55154,7 +60234,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "neon_vrsqrts<mode>"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w")
-@@ -2477,6 +2933,17 @@
+@@ -2481,6 +2948,17 @@
[(set_attr "type" "neon_fp_rsqrts_s<q>")]
)
@@ -55172,7 +60252,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_expand "neon_vabs<mode>"
[(match_operand:VDQW 0 "s_register_operand" "")
(match_operand:VDQW 1 "s_register_operand" "")]
-@@ -2592,6 +3059,15 @@
+@@ -2596,6 +3074,15 @@
})
(define_insn "neon_vrecpe<mode>"
@@ -55188,7 +60268,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
[(set (match_operand:V32 0 "s_register_operand" "=w")
(unspec:V32 [(match_operand:V32 1 "s_register_operand" "w")]
UNSPEC_VRECPE))]
-@@ -2928,6 +3404,28 @@ if (BYTES_BIG_ENDIAN)
+@@ -2932,6 +3419,28 @@ if (BYTES_BIG_ENDIAN)
[(set_attr "type" "neon_dup<q>")]
)
@@ -55217,7 +60297,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_expand "neon_vdup_lane<mode>"
[(match_operand:VDQW 0 "s_register_operand" "=w")
(match_operand:<V_double_vector_mode> 1 "s_register_operand" "w")
-@@ -2947,6 +3445,25 @@ if (BYTES_BIG_ENDIAN)
+@@ -2951,6 +3460,25 @@ if (BYTES_BIG_ENDIAN)
DONE;
})
@@ -55243,7 +60323,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
; Scalar index is ignored, since only zero is valid here.
(define_expand "neon_vdup_lanedi"
[(match_operand:DI 0 "s_register_operand" "=w")
-@@ -3093,6 +3610,28 @@ if (BYTES_BIG_ENDIAN)
+@@ -3097,6 +3625,28 @@ if (BYTES_BIG_ENDIAN)
[(set_attr "type" "neon_fp_cvt_narrow_s_q")]
)
@@ -55272,7 +60352,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "neon_vcvt<sup>_n<mode>"
[(set (match_operand:<V_CVTTO> 0 "s_register_operand" "=w")
(unspec:<V_CVTTO> [(match_operand:VCVTF 1 "s_register_operand" "w")
-@@ -3107,6 +3646,20 @@ if (BYTES_BIG_ENDIAN)
+@@ -3111,6 +3661,20 @@ if (BYTES_BIG_ENDIAN)
)
(define_insn "neon_vcvt<sup>_n<mode>"
@@ -55293,7 +60373,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
[(set (match_operand:<V_CVTTO> 0 "s_register_operand" "=w")
(unspec:<V_CVTTO> [(match_operand:VCVTI 1 "s_register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")]
-@@ -3119,6 +3672,31 @@ if (BYTES_BIG_ENDIAN)
+@@ -3123,6 +3687,31 @@ if (BYTES_BIG_ENDIAN)
[(set_attr "type" "neon_int_to_fp_<V_elem_ch><q>")]
)
@@ -55325,7 +60405,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "neon_vmovn<mode>"
[(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
(unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")]
-@@ -3189,6 +3767,18 @@ if (BYTES_BIG_ENDIAN)
+@@ -3193,6 +3782,18 @@ if (BYTES_BIG_ENDIAN)
(const_string "neon_mul_<V_elem_ch>_scalar<q>")))]
)
@@ -55344,7 +60424,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "neon_vmull<sup>_lane<mode>"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(unspec:<V_widen> [(match_operand:VMDI 1 "s_register_operand" "w")
-@@ -3443,6 +4033,19 @@ if (BYTES_BIG_ENDIAN)
+@@ -3447,6 +4048,19 @@ if (BYTES_BIG_ENDIAN)
DONE;
})
@@ -55364,7 +60444,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_expand "neon_vmulls_n<mode>"
[(match_operand:<V_widen> 0 "s_register_operand" "")
(match_operand:VMDI 1 "s_register_operand" "")
-@@ -4164,25 +4767,25 @@ if (BYTES_BIG_ENDIAN)
+@@ -4168,25 +4782,25 @@ if (BYTES_BIG_ENDIAN)
(define_expand "neon_vtrn<mode>_internal"
[(parallel
@@ -55402,7 +60482,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
"TARGET_NEON"
"vtrn.<V_sz_elem>\t%<V_reg>0, %<V_reg>2"
[(set_attr "type" "neon_permute<q>")]
-@@ -4190,25 +4793,25 @@ if (BYTES_BIG_ENDIAN)
+@@ -4194,25 +4808,25 @@ if (BYTES_BIG_ENDIAN)
(define_expand "neon_vzip<mode>_internal"
[(parallel
@@ -55441,7 +60521,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
"TARGET_NEON"
"vzip.<V_sz_elem>\t%<V_reg>0, %<V_reg>2"
[(set_attr "type" "neon_zip<q>")]
-@@ -4216,25 +4819,25 @@ if (BYTES_BIG_ENDIAN)
+@@ -4220,25 +4834,25 @@ if (BYTES_BIG_ENDIAN)
(define_expand "neon_vuzp<mode>_internal"
[(parallel
@@ -57861,7 +62941,19 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_predicate "ldrd_strd_offset_operand"
(and (match_operand 0 "const_int_operand")
-@@ -285,19 +283,19 @@
+@@ -243,11 +241,6 @@
+ (and (match_code "const_double")
+ (match_test "arm_const_double_rtx (op)"))))
+
+-(define_predicate "arm_float_compare_operand"
+- (if_then_else (match_test "TARGET_VFP")
+- (match_operand 0 "vfp_compare_operand")
+- (match_operand 0 "s_register_operand")))
+-
+ ;; True for valid index operands.
+ (define_predicate "index_operand"
+ (ior (match_operand 0 "s_register_operand")
+@@ -285,19 +278,19 @@
(match_test "power_of_two_operand (XEXP (op, 1), mode)"))
(and (match_code "rotate")
(match_test "CONST_INT_P (XEXP (op, 1))
@@ -57885,7 +62977,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(match_test "mode == GET_MODE (op)")))
;; True for shift operators which can be used with saturation instructions.
-@@ -306,7 +304,7 @@
+@@ -306,7 +299,7 @@
(match_test "power_of_two_operand (XEXP (op, 1), mode)"))
(and (match_code "ashift,ashiftrt")
(match_test "CONST_INT_P (XEXP (op, 1))
@@ -57894,7 +62986,20 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(match_test "mode == GET_MODE (op)")))
;; True for MULT, to identify which variant of shift_operator is in use.
-@@ -532,7 +530,7 @@
+@@ -398,6 +391,12 @@
+ || mode == CC_DGTUmode));
+ })
+
++;; Any register, including CC
++(define_predicate "cc_register_operand"
++ (and (match_code "reg")
++ (ior (match_operand 0 "s_register_operand")
++ (match_operand 0 "cc_register"))))
++
+ (define_special_predicate "arm_extendqisi_mem_op"
+ (and (match_operand 0 "memory_operand")
+ (match_test "TARGET_ARM ? arm_legitimate_address_outer_p (mode,
+@@ -532,7 +531,7 @@
(ior (and (match_code "reg,subreg")
(match_operand 0 "s_register_operand"))
(and (match_code "const_int")
@@ -57903,7 +63008,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_predicate "thumb1_cmpneg_operand"
(and (match_code "const_int")
-@@ -612,59 +610,13 @@
+@@ -612,69 +611,23 @@
(define_special_predicate "vect_par_constant_high"
(match_code "parallel")
{
@@ -57965,9 +63070,94 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
})
(define_predicate "const_double_vcvt_power_of_two_reciprocal"
+ (and (match_code "const_double")
+- (match_test "TARGET_32BIT && TARGET_VFP
+- && vfp3_const_double_for_fract_bits (op)")))
++ (match_test "TARGET_32BIT
++ && vfp3_const_double_for_fract_bits (op)")))
+
+ (define_predicate "const_double_vcvt_power_of_two"
+ (and (match_code "const_double")
+- (match_test "TARGET_32BIT && TARGET_VFP
++ (match_test "TARGET_32BIT
+ && vfp3_const_double_for_bits (op) > 0")))
+
+ (define_predicate "neon_struct_operand"
--- a/src/gcc/config/arm/sync.md
+++ b/src/gcc/config/arm/sync.md
-@@ -117,7 +117,7 @@
+@@ -63,37 +63,59 @@
+ (set_attr "predicable" "no")])
+
+ (define_insn "atomic_load<mode>"
+- [(set (match_operand:QHSI 0 "register_operand" "=r")
++ [(set (match_operand:QHSI 0 "register_operand" "=r,r,l")
+ (unspec_volatile:QHSI
+- [(match_operand:QHSI 1 "arm_sync_memory_operand" "Q")
+- (match_operand:SI 2 "const_int_operand")] ;; model
++ [(match_operand:QHSI 1 "arm_sync_memory_operand" "Q,Q,Q")
++ (match_operand:SI 2 "const_int_operand" "n,Pf,n")] ;; model
+ VUNSPEC_LDA))]
+ "TARGET_HAVE_LDACQ"
+ {
+ enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
+ if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release (model))
+- return \"ldr<sync_sfx>%?\\t%0, %1\";
++ {
++ if (TARGET_THUMB1)
++ return \"ldr<sync_sfx>\\t%0, %1\";
++ else
++ return \"ldr<sync_sfx>%?\\t%0, %1\";
++ }
+ else
+- return \"lda<sync_sfx>%?\\t%0, %1\";
++ {
++ if (TARGET_THUMB1)
++ return \"lda<sync_sfx>\\t%0, %1\";
++ else
++ return \"lda<sync_sfx>%?\\t%0, %1\";
++ }
+ }
+- [(set_attr "predicable" "yes")
++ [(set_attr "arch" "32,v8mb,any")
++ (set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")])
+
+ (define_insn "atomic_store<mode>"
+- [(set (match_operand:QHSI 0 "memory_operand" "=Q")
++ [(set (match_operand:QHSI 0 "memory_operand" "=Q,Q,Q")
+ (unspec_volatile:QHSI
+- [(match_operand:QHSI 1 "general_operand" "r")
+- (match_operand:SI 2 "const_int_operand")] ;; model
++ [(match_operand:QHSI 1 "general_operand" "r,r,l")
++ (match_operand:SI 2 "const_int_operand" "n,Pf,n")] ;; model
+ VUNSPEC_STL))]
+ "TARGET_HAVE_LDACQ"
+ {
+ enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
+ if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire (model))
+- return \"str<sync_sfx>%?\t%1, %0\";
++ {
++ if (TARGET_THUMB1)
++ return \"str<sync_sfx>\t%1, %0\";
++ else
++ return \"str<sync_sfx>%?\t%1, %0\";
++ }
+ else
+- return \"stl<sync_sfx>%?\t%1, %0\";
++ {
++ if (TARGET_THUMB1)
++ return \"stl<sync_sfx>\t%1, %0\";
++ else
++ return \"stl<sync_sfx>%?\t%1, %0\";
++ }
+ }
+- [(set_attr "predicable" "yes")
++ [(set_attr "arch" "32,v8mb,any")
++ (set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")])
+
+ ;; An LDRD instruction usable by the atomic_loaddi expander on LPAE targets
+@@ -117,7 +139,7 @@
[(match_operand:DI 0 "s_register_operand") ;; val out
(match_operand:DI 1 "mem_noofs_operand") ;; memory
(match_operand:SI 2 "const_int_operand")] ;; model
@@ -57976,7 +63166,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
&& ARM_DOUBLEWORD_ALIGN"
{
memmodel model = memmodel_from_int (INTVAL (operands[2]));
-@@ -125,7 +125,7 @@
+@@ -125,7 +147,7 @@
/* For ARMv8-A we can use an LDAEXD to atomically load two 32-bit registers
when acquire or stronger semantics are needed. When the relaxed model is
used this can be relaxed to a normal LDRD. */
@@ -57985,7 +63175,392 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
{
if (is_mm_relaxed (model))
emit_insn (gen_arm_atomic_loaddi2_ldrd (operands[0], operands[1]));
-@@ -436,7 +436,7 @@
+@@ -167,21 +189,23 @@
+ DONE;
+ })
+
++;; Constraints of this pattern must be at least as strict as those of the
++;; cbranchsi operations in thumb1.md and aim to be as permissive as possible.
+ (define_insn_and_split "atomic_compare_and_swap<mode>_1"
+- [(set (reg:CC_Z CC_REGNUM) ;; bool out
++ [(set (match_operand 0 "cc_register_operand" "=&c,&l,&l,&l") ;; bool out
+ (unspec_volatile:CC_Z [(const_int 0)] VUNSPEC_ATOMIC_CAS))
+- (set (match_operand:SI 0 "s_register_operand" "=&r") ;; val out
++ (set (match_operand:SI 1 "s_register_operand" "=&r,&l,&0,&l*h") ;; val out
+ (zero_extend:SI
+- (match_operand:NARROW 1 "mem_noofs_operand" "+Ua"))) ;; memory
+- (set (match_dup 1)
++ (match_operand:NARROW 2 "mem_noofs_operand" "+Ua,Ua,Ua,Ua"))) ;; memory
++ (set (match_dup 2)
+ (unspec_volatile:NARROW
+- [(match_operand:SI 2 "arm_add_operand" "rIL") ;; expected
+- (match_operand:NARROW 3 "s_register_operand" "r") ;; desired
+- (match_operand:SI 4 "const_int_operand") ;; is_weak
+- (match_operand:SI 5 "const_int_operand") ;; mod_s
+- (match_operand:SI 6 "const_int_operand")] ;; mod_f
++ [(match_operand:SI 3 "arm_add_operand" "rIL,lIL*h,J,*r") ;; expected
++ (match_operand:NARROW 4 "s_register_operand" "r,r,r,r") ;; desired
++ (match_operand:SI 5 "const_int_operand") ;; is_weak
++ (match_operand:SI 6 "const_int_operand") ;; mod_s
++ (match_operand:SI 7 "const_int_operand")] ;; mod_f
+ VUNSPEC_ATOMIC_CAS))
+- (clobber (match_scratch:SI 7 "=&r"))]
++ (clobber (match_scratch:SI 8 "=&r,X,X,X"))]
+ "<sync_predtab>"
+ "#"
+ "&& reload_completed"
+@@ -189,27 +213,30 @@
+ {
+ arm_split_compare_and_swap (operands);
+ DONE;
+- })
++ }
++ [(set_attr "arch" "32,v8mb,v8mb,v8mb")])
+
+ (define_mode_attr cas_cmp_operand
+ [(SI "arm_add_operand") (DI "cmpdi_operand")])
+ (define_mode_attr cas_cmp_str
+ [(SI "rIL") (DI "rDi")])
+
++;; Constraints of this pattern must be at least as strict as those of the
++;; cbranchsi operations in thumb1.md and aim to be as permissive as possible.
+ (define_insn_and_split "atomic_compare_and_swap<mode>_1"
+- [(set (reg:CC_Z CC_REGNUM) ;; bool out
++ [(set (match_operand 0 "cc_register_operand" "=&c,&l,&l,&l") ;; bool out
+ (unspec_volatile:CC_Z [(const_int 0)] VUNSPEC_ATOMIC_CAS))
+- (set (match_operand:SIDI 0 "s_register_operand" "=&r") ;; val out
+- (match_operand:SIDI 1 "mem_noofs_operand" "+Ua")) ;; memory
+- (set (match_dup 1)
++ (set (match_operand:SIDI 1 "s_register_operand" "=&r,&l,&0,&l*h") ;; val out
++ (match_operand:SIDI 2 "mem_noofs_operand" "+Ua,Ua,Ua,Ua")) ;; memory
++ (set (match_dup 2)
+ (unspec_volatile:SIDI
+- [(match_operand:SIDI 2 "<cas_cmp_operand>" "<cas_cmp_str>") ;; expect
+- (match_operand:SIDI 3 "s_register_operand" "r") ;; desired
+- (match_operand:SI 4 "const_int_operand") ;; is_weak
+- (match_operand:SI 5 "const_int_operand") ;; mod_s
+- (match_operand:SI 6 "const_int_operand")] ;; mod_f
++ [(match_operand:SIDI 3 "<cas_cmp_operand>" "<cas_cmp_str>,lIL*h,J,*r") ;; expect
++ (match_operand:SIDI 4 "s_register_operand" "r,r,r,r") ;; desired
++ (match_operand:SI 5 "const_int_operand") ;; is_weak
++ (match_operand:SI 6 "const_int_operand") ;; mod_s
++ (match_operand:SI 7 "const_int_operand")] ;; mod_f
+ VUNSPEC_ATOMIC_CAS))
+- (clobber (match_scratch:SI 7 "=&r"))]
++ (clobber (match_scratch:SI 8 "=&r,X,X,X"))]
+ "<sync_predtab>"
+ "#"
+ "&& reload_completed"
+@@ -217,18 +244,19 @@
+ {
+ arm_split_compare_and_swap (operands);
+ DONE;
+- })
++ }
++ [(set_attr "arch" "32,v8mb,v8mb,v8mb")])
+
+ (define_insn_and_split "atomic_exchange<mode>"
+- [(set (match_operand:QHSD 0 "s_register_operand" "=&r") ;; output
+- (match_operand:QHSD 1 "mem_noofs_operand" "+Ua")) ;; memory
++ [(set (match_operand:QHSD 0 "s_register_operand" "=&r,&r") ;; output
++ (match_operand:QHSD 1 "mem_noofs_operand" "+Ua,Ua")) ;; memory
+ (set (match_dup 1)
+ (unspec_volatile:QHSD
+- [(match_operand:QHSD 2 "s_register_operand" "r") ;; input
++ [(match_operand:QHSD 2 "s_register_operand" "r,r") ;; input
+ (match_operand:SI 3 "const_int_operand" "")] ;; model
+ VUNSPEC_ATOMIC_XCHG))
+ (clobber (reg:CC CC_REGNUM))
+- (clobber (match_scratch:SI 4 "=&r"))]
++ (clobber (match_scratch:SI 4 "=&r,&l"))]
+ "<sync_predtab>"
+ "#"
+ "&& reload_completed"
+@@ -237,7 +265,11 @@
+ arm_split_atomic_op (SET, operands[0], NULL, operands[1],
+ operands[2], operands[3], operands[4]);
+ DONE;
+- })
++ }
++ [(set_attr "arch" "32,v8mb")])
++
++;; The following mode and code attributes are defined here because they are
++;; specific to atomics and are not needed anywhere else.
+
+ (define_mode_attr atomic_op_operand
+ [(QI "reg_or_int_operand")
+@@ -248,16 +280,24 @@
+ (define_mode_attr atomic_op_str
+ [(QI "rn") (HI "rn") (SI "rn") (DI "r")])
+
++(define_code_attr thumb1_atomic_op_str
++ [(ior "l,l") (xor "l,l") (and "l,l") (plus "lIJL,r") (minus "lPd,lPd")])
++
++(define_code_attr thumb1_atomic_newop_str
++ [(ior "&l,&l") (xor "&l,&l") (and "&l,&l") (plus "&l,&r") (minus "&l,&l")])
++
++;; Constraints of this pattern must be at least as strict as those of the
++;; non-atomic operations in thumb1.md and aim to be as permissive as possible.
+ (define_insn_and_split "atomic_<sync_optab><mode>"
+- [(set (match_operand:QHSD 0 "mem_noofs_operand" "+Ua")
++ [(set (match_operand:QHSD 0 "mem_noofs_operand" "+Ua,Ua,Ua")
+ (unspec_volatile:QHSD
+ [(syncop:QHSD (match_dup 0)
+- (match_operand:QHSD 1 "<atomic_op_operand>" "<atomic_op_str>"))
++ (match_operand:QHSD 1 "<atomic_op_operand>" "<atomic_op_str>,<thumb1_atomic_op_str>"))
+ (match_operand:SI 2 "const_int_operand")] ;; model
+ VUNSPEC_ATOMIC_OP))
+ (clobber (reg:CC CC_REGNUM))
+- (clobber (match_scratch:QHSD 3 "=&r"))
+- (clobber (match_scratch:SI 4 "=&r"))]
++ (clobber (match_scratch:QHSD 3 "=&r,<thumb1_atomic_newop_str>"))
++ (clobber (match_scratch:SI 4 "=&r,&l,&l"))]
+ "<sync_predtab>"
+ "#"
+ "&& reload_completed"
+@@ -266,19 +306,22 @@
+ arm_split_atomic_op (<CODE>, NULL, operands[3], operands[0],
+ operands[1], operands[2], operands[4]);
+ DONE;
+- })
++ }
++ [(set_attr "arch" "32,v8mb,v8mb")])
+
++;; Constraints of this pattern must be at least as strict as those of the
++;; non-atomic NANDs in thumb1.md and aim to be as permissive as possible.
+ (define_insn_and_split "atomic_nand<mode>"
+- [(set (match_operand:QHSD 0 "mem_noofs_operand" "+Ua")
++ [(set (match_operand:QHSD 0 "mem_noofs_operand" "+Ua,Ua")
+ (unspec_volatile:QHSD
+ [(not:QHSD
+ (and:QHSD (match_dup 0)
+- (match_operand:QHSD 1 "<atomic_op_operand>" "<atomic_op_str>")))
++ (match_operand:QHSD 1 "<atomic_op_operand>" "<atomic_op_str>,l")))
+ (match_operand:SI 2 "const_int_operand")] ;; model
+ VUNSPEC_ATOMIC_OP))
+ (clobber (reg:CC CC_REGNUM))
+- (clobber (match_scratch:QHSD 3 "=&r"))
+- (clobber (match_scratch:SI 4 "=&r"))]
++ (clobber (match_scratch:QHSD 3 "=&r,&l"))
++ (clobber (match_scratch:SI 4 "=&r,&l"))]
+ "<sync_predtab>"
+ "#"
+ "&& reload_completed"
+@@ -287,20 +330,38 @@
+ arm_split_atomic_op (NOT, NULL, operands[3], operands[0],
+ operands[1], operands[2], operands[4]);
+ DONE;
+- })
++ }
++ [(set_attr "arch" "32,v8mb")])
++
++;; 3 alternatives are needed to represent constraints after split from
++;; thumb1_addsi3: (i) case where operand1 and destination can be in different
++;; registers, (ii) case where they are in the same low register and (iii) case
++;; when they are in the same register without restriction on the register. We
++;; slightly disparage alternatives that require copying the old value into the
++;; register for the new value (see bind_old_new in arm_split_atomic_op).
++(define_code_attr thumb1_atomic_fetch_op_str
++ [(ior "l,l,l") (xor "l,l,l") (and "l,l,l") (plus "lL,?IJ,?r") (minus "lPd,lPd,lPd")])
++
++(define_code_attr thumb1_atomic_fetch_newop_str
++ [(ior "&l,&l,&l") (xor "&l,&l,&l") (and "&l,&l,&l") (plus "&l,&l,&r") (minus "&l,&l,&l")])
+
++(define_code_attr thumb1_atomic_fetch_oldop_str
++ [(ior "&r,&r,&r") (xor "&r,&r,&r") (and "&r,&r,&r") (plus "&l,&r,&r") (minus "&l,&l,&l")])
++
++;; Constraints of this pattern must be at least as strict as those of the
++;; non-atomic operations in thumb1.md and aim to be as permissive as possible.
+ (define_insn_and_split "atomic_fetch_<sync_optab><mode>"
+- [(set (match_operand:QHSD 0 "s_register_operand" "=&r")
+- (match_operand:QHSD 1 "mem_noofs_operand" "+Ua"))
++ [(set (match_operand:QHSD 0 "s_register_operand" "=&r,<thumb1_atomic_fetch_oldop_str>")
++ (match_operand:QHSD 1 "mem_noofs_operand" "+Ua,Ua,Ua,Ua"))
+ (set (match_dup 1)
+ (unspec_volatile:QHSD
+ [(syncop:QHSD (match_dup 1)
+- (match_operand:QHSD 2 "<atomic_op_operand>" "<atomic_op_str>"))
++ (match_operand:QHSD 2 "<atomic_op_operand>" "<atomic_op_str>,<thumb1_atomic_fetch_op_str>"))
+ (match_operand:SI 3 "const_int_operand")] ;; model
+ VUNSPEC_ATOMIC_OP))
+ (clobber (reg:CC CC_REGNUM))
+- (clobber (match_scratch:QHSD 4 "=&r"))
+- (clobber (match_scratch:SI 5 "=&r"))]
++ (clobber (match_scratch:QHSD 4 "=&r,<thumb1_atomic_fetch_newop_str>"))
++ (clobber (match_scratch:SI 5 "=&r,&l,&l,&l"))]
+ "<sync_predtab>"
+ "#"
+ "&& reload_completed"
+@@ -309,21 +370,24 @@
+ arm_split_atomic_op (<CODE>, operands[0], operands[4], operands[1],
+ operands[2], operands[3], operands[5]);
+ DONE;
+- })
++ }
++ [(set_attr "arch" "32,v8mb,v8mb,v8mb")])
+
++;; Constraints of this pattern must be at least as strict as those of the
++;; non-atomic NANDs in thumb1.md and aim to be as permissive as possible.
+ (define_insn_and_split "atomic_fetch_nand<mode>"
+- [(set (match_operand:QHSD 0 "s_register_operand" "=&r")
+- (match_operand:QHSD 1 "mem_noofs_operand" "+Ua"))
++ [(set (match_operand:QHSD 0 "s_register_operand" "=&r,&r")
++ (match_operand:QHSD 1 "mem_noofs_operand" "+Ua,Ua"))
+ (set (match_dup 1)
+ (unspec_volatile:QHSD
+ [(not:QHSD
+ (and:QHSD (match_dup 1)
+- (match_operand:QHSD 2 "<atomic_op_operand>" "<atomic_op_str>")))
++ (match_operand:QHSD 2 "<atomic_op_operand>" "<atomic_op_str>,l")))
+ (match_operand:SI 3 "const_int_operand")] ;; model
+ VUNSPEC_ATOMIC_OP))
+ (clobber (reg:CC CC_REGNUM))
+- (clobber (match_scratch:QHSD 4 "=&r"))
+- (clobber (match_scratch:SI 5 "=&r"))]
++ (clobber (match_scratch:QHSD 4 "=&r,&l"))
++ (clobber (match_scratch:SI 5 "=&r,&l"))]
+ "<sync_predtab>"
+ "#"
+ "&& reload_completed"
+@@ -332,20 +396,23 @@
+ arm_split_atomic_op (NOT, operands[0], operands[4], operands[1],
+ operands[2], operands[3], operands[5]);
+ DONE;
+- })
++ }
++ [(set_attr "arch" "32,v8mb")])
+
++;; Constraints of this pattern must be at least as strict as those of the
++;; non-atomic operations in thumb1.md and aim to be as permissive as possible.
+ (define_insn_and_split "atomic_<sync_optab>_fetch<mode>"
+- [(set (match_operand:QHSD 0 "s_register_operand" "=&r")
++ [(set (match_operand:QHSD 0 "s_register_operand" "=&r,<thumb1_atomic_newop_str>")
+ (syncop:QHSD
+- (match_operand:QHSD 1 "mem_noofs_operand" "+Ua")
+- (match_operand:QHSD 2 "<atomic_op_operand>" "<atomic_op_str>")))
++ (match_operand:QHSD 1 "mem_noofs_operand" "+Ua,Ua,Ua")
++ (match_operand:QHSD 2 "<atomic_op_operand>" "<atomic_op_str>,<thumb1_atomic_op_str>")))
+ (set (match_dup 1)
+ (unspec_volatile:QHSD
+ [(match_dup 1) (match_dup 2)
+ (match_operand:SI 3 "const_int_operand")] ;; model
+ VUNSPEC_ATOMIC_OP))
+ (clobber (reg:CC CC_REGNUM))
+- (clobber (match_scratch:SI 4 "=&r"))]
++ (clobber (match_scratch:SI 4 "=&r,&l,&l"))]
+ "<sync_predtab>"
+ "#"
+ "&& reload_completed"
+@@ -354,21 +421,24 @@
+ arm_split_atomic_op (<CODE>, NULL, operands[0], operands[1],
+ operands[2], operands[3], operands[4]);
+ DONE;
+- })
++ }
++ [(set_attr "arch" "32,v8mb,v8mb")])
+
++;; Constraints of this pattern must be at least as strict as those of the
++;; non-atomic NANDs in thumb1.md and aim to be as permissive as possible.
+ (define_insn_and_split "atomic_nand_fetch<mode>"
+- [(set (match_operand:QHSD 0 "s_register_operand" "=&r")
++ [(set (match_operand:QHSD 0 "s_register_operand" "=&r,&l")
+ (not:QHSD
+ (and:QHSD
+- (match_operand:QHSD 1 "mem_noofs_operand" "+Ua")
+- (match_operand:QHSD 2 "<atomic_op_operand>" "<atomic_op_str>"))))
++ (match_operand:QHSD 1 "mem_noofs_operand" "+Ua,Ua")
++ (match_operand:QHSD 2 "<atomic_op_operand>" "<atomic_op_str>,l"))))
+ (set (match_dup 1)
+ (unspec_volatile:QHSD
+ [(match_dup 1) (match_dup 2)
+ (match_operand:SI 3 "const_int_operand")] ;; model
+ VUNSPEC_ATOMIC_OP))
+ (clobber (reg:CC CC_REGNUM))
+- (clobber (match_scratch:SI 4 "=&r"))]
++ (clobber (match_scratch:SI 4 "=&r,&l"))]
+ "<sync_predtab>"
+ "#"
+ "&& reload_completed"
+@@ -377,48 +447,61 @@
+ arm_split_atomic_op (NOT, NULL, operands[0], operands[1],
+ operands[2], operands[3], operands[4]);
+ DONE;
+- })
++ }
++ [(set_attr "arch" "32,v8mb")])
+
+ (define_insn "arm_load_exclusive<mode>"
+- [(set (match_operand:SI 0 "s_register_operand" "=r")
++ [(set (match_operand:SI 0 "s_register_operand" "=r,r")
+ (zero_extend:SI
+ (unspec_volatile:NARROW
+- [(match_operand:NARROW 1 "mem_noofs_operand" "Ua")]
++ [(match_operand:NARROW 1 "mem_noofs_operand" "Ua,Ua")]
+ VUNSPEC_LL)))]
+ "TARGET_HAVE_LDREXBH"
+- "ldrex<sync_sfx>%?\t%0, %C1"
+- [(set_attr "predicable" "yes")
++ "@
++ ldrex<sync_sfx>%?\t%0, %C1
++ ldrex<sync_sfx>\t%0, %C1"
++ [(set_attr "arch" "32,v8mb")
++ (set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")])
+
+ (define_insn "arm_load_acquire_exclusive<mode>"
+- [(set (match_operand:SI 0 "s_register_operand" "=r")
++ [(set (match_operand:SI 0 "s_register_operand" "=r,r")
+ (zero_extend:SI
+ (unspec_volatile:NARROW
+- [(match_operand:NARROW 1 "mem_noofs_operand" "Ua")]
++ [(match_operand:NARROW 1 "mem_noofs_operand" "Ua,Ua")]
+ VUNSPEC_LAX)))]
+ "TARGET_HAVE_LDACQ"
+- "ldaex<sync_sfx>%?\\t%0, %C1"
+- [(set_attr "predicable" "yes")
++ "@
++ ldaex<sync_sfx>%?\\t%0, %C1
++ ldaex<sync_sfx>\\t%0, %C1"
++ [(set_attr "arch" "32,v8mb")
++ (set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")])
+
+ (define_insn "arm_load_exclusivesi"
+- [(set (match_operand:SI 0 "s_register_operand" "=r")
++ [(set (match_operand:SI 0 "s_register_operand" "=r,r")
+ (unspec_volatile:SI
+- [(match_operand:SI 1 "mem_noofs_operand" "Ua")]
++ [(match_operand:SI 1 "mem_noofs_operand" "Ua,Ua")]
+ VUNSPEC_LL))]
+ "TARGET_HAVE_LDREX"
+- "ldrex%?\t%0, %C1"
+- [(set_attr "predicable" "yes")
++ "@
++ ldrex%?\t%0, %C1
++ ldrex\t%0, %C1"
++ [(set_attr "arch" "32,v8mb")
++ (set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")])
+
+ (define_insn "arm_load_acquire_exclusivesi"
+- [(set (match_operand:SI 0 "s_register_operand" "=r")
++ [(set (match_operand:SI 0 "s_register_operand" "=r,r")
+ (unspec_volatile:SI
+- [(match_operand:SI 1 "mem_noofs_operand" "Ua")]
++ [(match_operand:SI 1 "mem_noofs_operand" "Ua,Ua")]
+ VUNSPEC_LAX))]
+ "TARGET_HAVE_LDACQ"
+- "ldaex%?\t%0, %C1"
+- [(set_attr "predicable" "yes")
++ "@
++ ldaex%?\t%0, %C1
++ ldaex\t%0, %C1"
++ [(set_attr "arch" "32,v8mb")
++ (set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")])
+
+ (define_insn "arm_load_exclusivedi"
+@@ -436,7 +519,7 @@
(unspec_volatile:DI
[(match_operand:DI 1 "mem_noofs_operand" "Ua")]
VUNSPEC_LAX))]
@@ -57994,7 +63569,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
"ldaexd%?\t%0, %H0, %C1"
[(set_attr "predicable" "yes")
(set_attr "predicable_short_it" "no")])
-@@ -452,14 +452,13 @@
+@@ -452,16 +535,18 @@
{
if (<MODE>mode == DImode)
{
@@ -58011,9 +63586,15 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ gcc_assert ((REGNO (operands[2]) & 1) == 0 || TARGET_THUMB2);
+ return "strexd%?\t%0, %2, %H2, %C1";
}
- return "strex<sync_sfx>%?\t%0, %2, %C1";
+- return "strex<sync_sfx>%?\t%0, %2, %C1";
++ if (TARGET_THUMB1)
++ return "strex<sync_sfx>\t%0, %2, %C1";
++ else
++ return "strex<sync_sfx>%?\t%0, %2, %C1";
}
-@@ -473,13 +472,11 @@
+ [(set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")])
+@@ -473,25 +558,26 @@
(unspec_volatile:DI
[(match_operand:DI 2 "s_register_operand" "r")]
VUNSPEC_SLX))]
@@ -58030,6 +63611,26 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
[(set_attr "predicable" "yes")
(set_attr "predicable_short_it" "no")])
+
+ (define_insn "arm_store_release_exclusive<mode>"
+- [(set (match_operand:SI 0 "s_register_operand" "=&r")
++ [(set (match_operand:SI 0 "s_register_operand" "=&r,&r")
+ (unspec_volatile:SI [(const_int 0)] VUNSPEC_SLX))
+- (set (match_operand:QHSI 1 "mem_noofs_operand" "=Ua")
++ (set (match_operand:QHSI 1 "mem_noofs_operand" "=Ua,Ua")
+ (unspec_volatile:QHSI
+- [(match_operand:QHSI 2 "s_register_operand" "r")]
++ [(match_operand:QHSI 2 "s_register_operand" "r,r")]
+ VUNSPEC_SLX))]
+ "TARGET_HAVE_LDACQ"
+- "stlex<sync_sfx>%?\t%0, %2, %C1"
+- [(set_attr "predicable" "yes")
++ "@
++ stlex<sync_sfx>%?\t%0, %2, %C1
++ stlex<sync_sfx>\t%0, %2, %C1"
++ [(set_attr "arch" "32,v8mb")
++ (set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")])
--- a/src/gcc/config/arm/t-aprofile
+++ b/src/gcc/config/arm/t-aprofile
@@ -49,38 +49,33 @@ MULTILIB_DIRNAMES += fpv3 simdv1 fpv4 simdvfpv4 simdv8
@@ -58186,9 +63787,199 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
$(srcdir)/config/arm/arm-simd-builtin-types.def
$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
$(srcdir)/config/arm/arm-builtins.c
+--- /dev/null
++++ b/src/gcc/config/arm/t-rmprofile
+@@ -0,0 +1,176 @@
++# Copyright (C) 2016 Free Software Foundation, Inc.
++#
++# This file is part of GCC.
++#
++# GCC is free software; you can redistribute it and/or modify
++# it under the terms of the GNU General Public License as published by
++# the Free Software Foundation; either version 3, or (at your option)
++# any later version.
++#
++# GCC is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY; without even the implied warranty of
++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++# GNU General Public License for more details.
++#
++# You should have received a copy of the GNU General Public License
++# along with GCC; see the file COPYING3. If not see
++# <http://www.gnu.org/licenses/>.
++
++# This is a target makefile fragment that attempts to get
++# multilibs built for the range of CPUs, FPUs and ABIs that
++# are relevant for the ARM architecture. It should not be used in
++# conjunction with another makefile fragment and assumes --with-arch,
++# --with-cpu, --with-fpu, --with-float, --with-mode have their default
++# values during the configure step. We enforce this during the
++# top-level configury.
++
++MULTILIB_OPTIONS =
++MULTILIB_DIRNAMES =
++MULTILIB_EXCEPTIONS =
++MULTILIB_MATCHES =
++MULTILIB_REUSE =
++
++# We have the following hierarchy:
++# ISA: A32 (.) or T16/T32 (thumb).
++# Architecture: ARMv6S-M (v6-m), ARMv7-M (v7-m), ARMv7E-M (v7e-m),
++# ARMv8-M Baseline (v8-m.base) or ARMv8-M Mainline (v8-m.main).
++# FPU: VFPv3-D16 (fpv3), FPV4-SP-D16 (fpv4-sp), FPV5-SP-D16 (fpv5-sp),
++# VFPv5-D16 (fpv5), or None (.).
++# Float-abi: Soft (.), softfp (softfp), or hard (hardfp).
++
++# Options to build libraries with
++
++MULTILIB_OPTIONS += mthumb
++MULTILIB_DIRNAMES += thumb
++
++MULTILIB_OPTIONS += march=armv6s-m/march=armv7-m/march=armv7e-m/march=armv7/march=armv8-m.base/march=armv8-m.main
++MULTILIB_DIRNAMES += v6-m v7-m v7e-m v7-ar v8-m.base v8-m.main
++
++MULTILIB_OPTIONS += mfpu=vfpv3-d16/mfpu=fpv4-sp-d16/mfpu=fpv5-sp-d16/mfpu=fpv5-d16
++MULTILIB_DIRNAMES += fpv3 fpv4-sp fpv5-sp fpv5
++
++MULTILIB_OPTIONS += mfloat-abi=softfp/mfloat-abi=hard
++MULTILIB_DIRNAMES += softfp hard
++
++
++# Option combinations to build library with
++
++# Default CPU/Arch
++MULTILIB_REQUIRED += mthumb
++MULTILIB_REQUIRED += mfloat-abi=hard
++
++# ARMv6-M
++MULTILIB_REQUIRED += mthumb/march=armv6s-m
++
++# ARMv8-M Baseline
++MULTILIB_REQUIRED += mthumb/march=armv8-m.base
++
++# ARMv7-M
++MULTILIB_REQUIRED += mthumb/march=armv7-m
++
++# ARMv7E-M
++MULTILIB_REQUIRED += mthumb/march=armv7e-m
++MULTILIB_REQUIRED += mthumb/march=armv7e-m/mfpu=fpv4-sp-d16/mfloat-abi=softfp
++MULTILIB_REQUIRED += mthumb/march=armv7e-m/mfpu=fpv4-sp-d16/mfloat-abi=hard
++MULTILIB_REQUIRED += mthumb/march=armv7e-m/mfpu=fpv5-d16/mfloat-abi=softfp
++MULTILIB_REQUIRED += mthumb/march=armv7e-m/mfpu=fpv5-d16/mfloat-abi=hard
++MULTILIB_REQUIRED += mthumb/march=armv7e-m/mfpu=fpv5-sp-d16/mfloat-abi=softfp
++MULTILIB_REQUIRED += mthumb/march=armv7e-m/mfpu=fpv5-sp-d16/mfloat-abi=hard
++
++# ARMv8-M Mainline
++MULTILIB_REQUIRED += mthumb/march=armv8-m.main
++MULTILIB_REQUIRED += mthumb/march=armv8-m.main/mfpu=fpv5-d16/mfloat-abi=softfp
++MULTILIB_REQUIRED += mthumb/march=armv8-m.main/mfpu=fpv5-d16/mfloat-abi=hard
++MULTILIB_REQUIRED += mthumb/march=armv8-m.main/mfpu=fpv5-sp-d16/mfloat-abi=softfp
++MULTILIB_REQUIRED += mthumb/march=armv8-m.main/mfpu=fpv5-sp-d16/mfloat-abi=hard
++
++# ARMv7-R as well as ARMv7-A and ARMv8-A if aprofile was not specified
++MULTILIB_REQUIRED += mthumb/march=armv7
++MULTILIB_REQUIRED += mthumb/march=armv7/mfpu=vfpv3-d16/mfloat-abi=softfp
++MULTILIB_REQUIRED += mthumb/march=armv7/mfpu=vfpv3-d16/mfloat-abi=hard
++
++
++# Matches
++
++# CPU Matches
++MULTILIB_MATCHES += march?armv6s-m=mcpu?cortex-m0
++MULTILIB_MATCHES += march?armv6s-m=mcpu?cortex-m0.small-multiply
++MULTILIB_MATCHES += march?armv6s-m=mcpu?cortex-m0plus
++MULTILIB_MATCHES += march?armv6s-m=mcpu?cortex-m0plus.small-multiply
++MULTILIB_MATCHES += march?armv6s-m=mcpu?cortex-m1
++MULTILIB_MATCHES += march?armv6s-m=mcpu?cortex-m1.small-multiply
++MULTILIB_MATCHES += march?armv7-m=mcpu?cortex-m3
++MULTILIB_MATCHES += march?armv7e-m=mcpu?cortex-m4
++MULTILIB_MATCHES += march?armv7e-m=mcpu?cortex-m7
++MULTILIB_MATCHES += march?armv8-m.base=mcpu?cortex-m23
++MULTILIB_MATCHES += march?armv8-m.main=mcpu?cortex-m33
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-r4
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-r4f
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-r5
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-r7
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-r8
++MULTILIB_MATCHES += march?armv7=mcpu?marvell-pj4
++MULTILIB_MATCHES += march?armv7=mcpu?generic-armv7-a
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-a8
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-a9
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-a5
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-a7
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-a15
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-a12
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-a17
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-a15.cortex-a7
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-a17.cortex-a7
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-a32
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-a35
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-a53
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-a57
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-a57.cortex-a53
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-a72
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-a72.cortex-a53
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-a73
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-a73.cortex-a35
++MULTILIB_MATCHES += march?armv7=mcpu?cortex-a73.cortex-a53
++MULTILIB_MATCHES += march?armv7=mcpu?exynos-m1
++MULTILIB_MATCHES += march?armv7=mcpu?qdf24xx
++MULTILIB_MATCHES += march?armv7=mcpu?xgene1
++
++# Arch Matches
++MULTILIB_MATCHES += march?armv6s-m=march?armv6-m
++MULTILIB_MATCHES += march?armv8-m.main=march?armv8-m.main+dsp
++MULTILIB_MATCHES += march?armv7=march?armv7-r
++ifeq (,$(HAS_APROFILE))
++MULTILIB_MATCHES += march?armv7=march?armv7-a
++MULTILIB_MATCHES += march?armv7=march?armv7ve
++MULTILIB_MATCHES += march?armv7=march?armv8-a
++MULTILIB_MATCHES += march?armv7=march?armv8-a+crc
++MULTILIB_MATCHES += march?armv7=march?armv8.1-a
++MULTILIB_MATCHES += march?armv7=march?armv8.1-a+crc
++MULTILIB_MATCHES += march?armv7=march?armv8.2-a
++MULTILIB_MATCHES += march?armv7=march?armv8.2-a+fp16
++endif
++
++# FPU matches
++ifeq (,$(HAS_APROFILE))
++MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv3
++MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv3-fp16
++MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv3-d16-fp16
++MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?neon
++MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?neon-fp16
++MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv4
++MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv4-d16
++MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?neon-vfpv4
++MULTILIB_MATCHES += mfpu?fpv5-d16=mfpu?fp-armv8
++MULTILIB_MATCHES += mfpu?fpv5-d16=mfpu?neon-fp-armv8
++MULTILIB_MATCHES += mfpu?fpv5-d16=mfpu?crypto-neon-fp-armv8
++endif
++
++
++# We map all requests for ARMv7-R or ARMv7-A in ARM mode to Thumb mode and
++# any FPU to VFPv3-d16 if possible.
++MULTILIB_REUSE += mthumb/march.armv7=march.armv7
++MULTILIB_REUSE += mthumb/march.armv7/mfpu.vfpv3-d16/mfloat-abi.softfp=march.armv7/mfpu.vfpv3-d16/mfloat-abi.softfp
++MULTILIB_REUSE += mthumb/march.armv7/mfpu.vfpv3-d16/mfloat-abi.hard=march.armv7/mfpu.vfpv3-d16/mfloat-abi.hard
++MULTILIB_REUSE += mthumb/march.armv7/mfpu.vfpv3-d16/mfloat-abi.softfp=march.armv7/mfpu.fpv5-d16/mfloat-abi.softfp
++MULTILIB_REUSE += mthumb/march.armv7/mfpu.vfpv3-d16/mfloat-abi.hard=march.armv7/mfpu.fpv5-d16/mfloat-abi.hard
++MULTILIB_REUSE += mthumb/march.armv7/mfpu.vfpv3-d16/mfloat-abi.softfp=mthumb/march.armv7/mfpu.fpv5-d16/mfloat-abi.softfp
++MULTILIB_REUSE += mthumb/march.armv7/mfpu.vfpv3-d16/mfloat-abi.hard=mthumb/march.armv7/mfpu.fpv5-d16/mfloat-abi.hard
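As an illustration of how this fragment is consumed (the exact result
depends on how the toolchain was configured): a Cortex-M33 invocation such
as "arm-none-eabi-gcc -mcpu=cortex-m33 -mfloat-abi=hard -mfpu=fpv5-sp-d16
--print-multi-directory" would be rewritten by the MULTILIB_MATCHES line
for cortex-m33 to march=armv8-m.main and should report the
thumb/v8-m.main/fpv5-sp/hard multilib directory listed above.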
--- a/src/gcc/config/arm/thumb1.md
+++ b/src/gcc/config/arm/thumb1.md
-@@ -114,8 +114,8 @@
+@@ -55,6 +55,10 @@
+ (set_attr "type" "multiple")]
+ )
+
++;; Changes to the constraints of this pattern must be propagated to those of
++;; atomic additions in sync.md and to the logic for bind_old_new in
++;; arm_split_atomic_op in arm.c. These must be at least as strict as the
++;; constraints here and aim to be as permissive as possible.
+ (define_insn_and_split "*thumb1_addsi3"
+ [(set (match_operand:SI 0 "register_operand" "=l,l,l,*rk,*hk,l,k,l,l,l")
+ (plus:SI (match_operand:SI 1 "register_operand" "%0,0,l,*0,*0,k,k,0,l,k")
+@@ -114,8 +118,8 @@
(set (match_dup 0)
(plus:SI (match_dup 0) (reg:SI SP_REGNUM)))]
"TARGET_THUMB1
@@ -58199,7 +63990,18 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
[(set (match_dup 0) (plus:SI (reg:SI SP_REGNUM) (match_dup 1)))]
""
)
-@@ -142,11 +142,11 @@
+@@ -131,6 +135,10 @@
+ (set_attr "type" "multiple")]
+ )
+
++;; Changes to the constraints of this pattern must be propagated to those of
++;; atomic subtractions in sync.md and to the logic for bind_old_new in
++;; arm_split_atomic_op in arm.c. These must be at least as strict as the
++;; constraints here and aim to be as permissive as possible.
+ (define_insn "thumb1_subsi3_insn"
+ [(set (match_operand:SI 0 "register_operand" "=l")
+ (minus:SI (match_operand:SI 1 "register_operand" "l")
+@@ -142,11 +150,11 @@
(set_attr "type" "alus_sreg")]
)
@@ -58216,7 +64018,40 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "*thumb_mulsi3"
[(set (match_operand:SI 0 "register_operand" "=&l,&l,&l")
(mult:SI (match_operand:SI 1 "register_operand" "%l,*h,0")
-@@ -590,8 +590,8 @@
+@@ -173,6 +181,10 @@
+ (set_attr "type" "muls")]
+ )
+
++;; Changes to the constraints of this pattern must be propagated to those of
++;; atomic bitwise ANDs and NANDs in sync.md and to the logic for bind_old_new
++;; in arm_split_atomic_op in arm.c. These must be at least as strict as the
++;; constraints here and aim to be as permissive as possible.
+ (define_insn "*thumb1_andsi3_insn"
+ [(set (match_operand:SI 0 "register_operand" "=l")
+ (and:SI (match_operand:SI 1 "register_operand" "%0")
+@@ -227,6 +239,10 @@
+ (set_attr "type" "logics_reg")]
+ )
+
++;; Changes to the constraints of this pattern must be propagated to those of
++;; atomic inclusive ORs in sync.md and to the logic for bind_old_new in
++;; arm_split_atomic_op in arm.c. These must be at least as strict as the
++;; constraints here and aim to be as permissive as possible.
+ (define_insn "*thumb1_iorsi3_insn"
+ [(set (match_operand:SI 0 "register_operand" "=l")
+ (ior:SI (match_operand:SI 1 "register_operand" "%0")
+@@ -237,6 +253,10 @@
+ (set_attr "conds" "set")
+ (set_attr "type" "logics_reg")])
+
++;; Changes to the constraints of this pattern must be propagated to those of
++;; atomic exclusive ORs in sync.md and to the logic for bind_old_new in
++;; arm_split_atomic_op in arm.c. These must be at least as strict as the
++;; constraints here and aim to be as permissive as possible.
+ (define_insn "*thumb1_xorsi3_insn"
+ [(set (match_operand:SI 0 "register_operand" "=l")
+ (xor:SI (match_operand:SI 1 "register_operand" "%0")
+@@ -590,8 +610,8 @@
;;; ??? The 'i' constraint looks funny, but it should always be replaced by
;;; thumb_reorg with a memory reference.
(define_insn "*thumb1_movdi_insn"
@@ -58227,7 +64062,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
"TARGET_THUMB1
&& ( register_operand (operands[0], DImode)
|| register_operand (operands[1], DImode))"
-@@ -610,36 +610,41 @@
+@@ -610,36 +630,41 @@
operands[1] = GEN_INT (- INTVAL (operands[1]));
return \"movs\\t%Q0, %1\;rsbs\\t%Q0, %Q0, #0\;asrs\\t%R0, %Q0, #31\";
case 3:
@@ -58278,7 +64113,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
#
#
ldmia\\t%1, {%0}
-@@ -647,10 +652,11 @@
+@@ -647,10 +672,11 @@
ldr\\t%0, %1
str\\t%1, %0
mov\\t%0, %1"
@@ -58294,7 +64129,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
; Split the load of 64-bit constant into two loads for high and low 32-bit parts respectively
; to see if we can load them in fewer instructions or fewer cycles.
-@@ -687,7 +693,8 @@
+@@ -687,7 +713,8 @@
(define_split
[(set (match_operand:SI 0 "register_operand" "")
(match_operand:SI 1 "const_int_operand" ""))]
@@ -58304,7 +64139,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
[(set (match_dup 2) (match_dup 1))
(set (match_dup 0) (ashift:SI (match_dup 2) (match_dup 3)))]
"
-@@ -714,7 +721,8 @@
+@@ -714,7 +741,8 @@
(define_split
[(set (match_operand:SI 0 "register_operand" "")
(match_operand:SI 1 "const_int_operand" ""))]
@@ -58314,7 +64149,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
[(set (match_dup 2) (match_dup 1))
(set (match_dup 0) (plus:SI (match_dup 2) (match_dup 3)))]
"
-@@ -726,8 +734,8 @@
+@@ -726,8 +754,8 @@
)
(define_insn "*thumb1_movhi_insn"
@@ -58325,7 +64160,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
"TARGET_THUMB1
&& ( register_operand (operands[0], HImode)
|| register_operand (operands[1], HImode))"
-@@ -739,6 +747,8 @@
+@@ -739,6 +767,8 @@
case 3: return \"mov %0, %1\";
case 4: return \"mov %0, %1\";
case 5: return \"movs %0, %1\";
@@ -58334,7 +64169,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
default: gcc_unreachable ();
case 1:
/* The stack pointer can end up being taken as an index register.
-@@ -758,9 +768,10 @@
+@@ -758,9 +788,10 @@
}
return \"ldrh %0, %1\";
}"
@@ -58348,7 +64183,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_expand "thumb_movhi_clobber"
[(set (match_operand:HI 0 "memory_operand" "")
-@@ -963,6 +974,91 @@
+@@ -963,6 +994,94 @@
DONE;
})
@@ -58437,12 +64272,179 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ (const_string "multiple")))]
+)
+
++;; Changes to the constraints of this pattern must be propagated to those of
++;; atomic compare_and_swap splitters in sync.md. These must be at least as
++;; strict as the constraints here and aim to be as permissive as possible.
(define_insn "cbranchsi4_insn"
[(set (pc) (if_then_else
(match_operator 0 "arm_comparison_operator"
+@@ -1024,6 +1143,9 @@
+ (set_attr "type" "multiple")]
+ )
+
++;; Changes to the constraints of this pattern must be propagated to those of
++;; atomic compare_and_swap splitters in sync.md. These must be at least as
++;; strict as the constraints here and aim to be as permissive as possible.
+ (define_insn "cbranchsi4_scratch"
+ [(set (pc) (if_then_else
+ (match_operator 4 "arm_comparison_operator"
+@@ -1609,6 +1731,19 @@
+ (set_attr "type" "call")]
+ )
+
++(define_insn "*nonsecure_call_reg_thumb1_v5"
++ [(call (unspec:SI [(mem:SI (match_operand:SI 0 "register_operand" "l*r"))]
++ UNSPEC_NONSECURE_MEM)
++ (match_operand 1 "" ""))
++ (use (match_operand 2 "" ""))
++ (clobber (reg:SI LR_REGNUM))
++ (clobber (match_dup 0))]
++ "TARGET_THUMB1 && use_cmse && !SIBLING_CALL_P (insn)"
++ "bl\\t__gnu_cmse_nonsecure_call"
++ [(set_attr "length" "4")
++ (set_attr "type" "call")]
++)
++
+ (define_insn "*call_reg_thumb1"
+ [(call (mem:SI (match_operand:SI 0 "register_operand" "l*r"))
+ (match_operand 1 "" ""))
+@@ -1641,6 +1776,21 @@
+ (set_attr "type" "call")]
+ )
+
++(define_insn "*nonsecure_call_value_reg_thumb1_v5"
++ [(set (match_operand 0 "" "")
++ (call (unspec:SI
++ [(mem:SI (match_operand:SI 1 "register_operand" "l*r"))]
++ UNSPEC_NONSECURE_MEM)
++ (match_operand 2 "" "")))
++ (use (match_operand 3 "" ""))
++ (clobber (reg:SI LR_REGNUM))
++ (clobber (match_dup 1))]
++ "TARGET_THUMB1 && use_cmse"
++ "bl\\t__gnu_cmse_nonsecure_call"
++ [(set_attr "length" "4")
++ (set_attr "type" "call")]
++)
++
+ (define_insn "*call_value_reg_thumb1"
+ [(set (match_operand 0 "" "")
+ (call (mem:SI (match_operand:SI 1 "register_operand" "l*r"))
+@@ -1747,8 +1897,13 @@
+ "*
+ return thumb1_unexpanded_epilogue ();
+ "
+- ; Length is absolute worst case
+- [(set_attr "length" "44")
++ ; Length is the absolute worst case; when using CMSE and this is an entry
++ ; function, an extra 4 (MSR) bytes will be added.
++ [(set (attr "length")
++ (if_then_else
++ (match_test "IS_CMSE_ENTRY (arm_current_func_type ())")
++ (const_int 48)
++ (const_int 44)))
+ (set_attr "type" "block")
+ ;; We don't clobber the conditions, but the potential length of this
+ ;; operation is sufficient to make conditionalizing the sequence
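The *nonsecure_call* patterns added above are the back end of the
cmse_nonsecure_call attribute: the call is funnelled through the
__gnu_cmse_nonsecure_call helper, which scrubs caller-saved state before
switching security state. A source-level sketch, assuming -mcmse; the
typedef and function names are hypothetical, the attribute and
cmse_is_nsfptr come from this patch:

    #include <arm_cmse.h>

    typedef void __attribute__ ((cmse_nonsecure_call)) ns_cb_t (int);

    void
    notify_nonsecure (ns_cb_t *cb)
    {
      /* Only call through a genuine non-secure function pointer,
         i.e. one with its LSB clear (see cmse_is_nsfptr above).  */
      if (cmse_is_nsfptr (cb))
        cb (1);  /* expands to bl __gnu_cmse_nonsecure_call  */
    }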
+--- a/src/gcc/config/arm/thumb2.md
++++ b/src/gcc/config/arm/thumb2.md
+@@ -278,8 +278,7 @@
+ (define_insn "*thumb2_movsi_insn"
+ [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,l,r,r,l ,*hk,m,*m")
+ (match_operand:SI 1 "general_operand" "rk,I,Py,K,j,mi,*mi,l,*hk"))]
+- "TARGET_THUMB2 && ! TARGET_IWMMXT
+- && !(TARGET_HARD_FLOAT && TARGET_VFP)
++ "TARGET_THUMB2 && !TARGET_IWMMXT && !TARGET_HARD_FLOAT
+ && ( register_operand (operands[0], SImode)
+ || register_operand (operands[1], SImode))"
+ "@
+@@ -581,6 +580,19 @@
+ [(set_attr "type" "call")]
+ )
+
++(define_insn "*nonsecure_call_reg_thumb2"
++ [(call (unspec:SI [(mem:SI (match_operand:SI 0 "s_register_operand" "r"))]
++ UNSPEC_NONSECURE_MEM)
++ (match_operand 1 "" ""))
++ (use (match_operand 2 "" ""))
++ (clobber (reg:SI LR_REGNUM))
++ (clobber (match_dup 0))]
++ "TARGET_THUMB2 && use_cmse"
++ "bl\\t__gnu_cmse_nonsecure_call"
++ [(set_attr "length" "4")
++ (set_attr "type" "call")]
++)
++
+ (define_insn "*call_value_reg_thumb2"
+ [(set (match_operand 0 "" "")
+ (call (mem:SI (match_operand:SI 1 "register_operand" "l*r"))
+@@ -592,6 +604,21 @@
+ [(set_attr "type" "call")]
+ )
+
++(define_insn "*nonsecure_call_value_reg_thumb2"
++ [(set (match_operand 0 "" "")
++ (call
++ (unspec:SI [(mem:SI (match_operand:SI 1 "register_operand" "l*r"))]
++ UNSPEC_NONSECURE_MEM)
++ (match_operand 2 "" "")))
++ (use (match_operand 3 "" ""))
++ (clobber (reg:SI LR_REGNUM))
++ (clobber (match_dup 1))]
++ "TARGET_THUMB2 && use_cmse"
++ "bl\t__gnu_cmse_nonsecure_call"
++ [(set_attr "length" "4")
++ (set_attr "type" "call")]
++)
++
+ (define_insn "*thumb2_indirect_jump"
+ [(set (pc)
+ (match_operand:SI 0 "register_operand" "l*r"))]
+@@ -1115,12 +1142,31 @@
+
+ (define_insn "*thumb2_return"
+ [(simple_return)]
+- "TARGET_THUMB2"
++ "TARGET_THUMB2 && !IS_CMSE_ENTRY (arm_current_func_type ())"
+ "* return output_return_instruction (const_true_rtx, true, false, true);"
+ [(set_attr "type" "branch")
+ (set_attr "length" "4")]
+ )
+
++(define_insn "*thumb2_cmse_entry_return"
++ [(simple_return)]
++ "TARGET_THUMB2 && IS_CMSE_ENTRY (arm_current_func_type ())"
++ "* return output_return_instruction (const_true_rtx, true, false, true);"
++ [(set_attr "type" "branch")
++ ; This is a return from a cmse_nonsecure_entry function, so code will be
++ ; added to clear the APSR and, if VFP is available, the FPSCR; the
++ ; length is adapted accordingly.
++ (set (attr "length")
++ (if_then_else (match_test "TARGET_HARD_FLOAT")
++ (const_int 12)
++ (const_int 8)))
++ ; We do not support predicated execution of returns from cmse_nonsecure_entry
++ ; functions because we need to clear the APSR. Since predicable has to be
++ ; a constant, we had to duplicate the thumb2_return pattern for CMSE entry
++ ; functions.
++ (set_attr "predicable" "no")]
++)
++
+ (define_insn_and_split "thumb2_eh_return"
+ [(unspec_volatile [(match_operand:SI 0 "s_register_operand" "r")]
+ VUNSPEC_EH_RETURN)
--- a/src/gcc/config/arm/unspecs.md
+++ b/src/gcc/config/arm/unspecs.md
-@@ -191,6 +191,8 @@
+@@ -84,6 +84,8 @@
+ UNSPEC_VRINTA ; Represent a float to integral float rounding
+ ; towards nearest, ties away from zero.
+ UNSPEC_PROBE_STACK ; Probe stack memory reference
++ UNSPEC_NONSECURE_MEM ; Represent non-secure memory in ARMv8-M with
++ ; security extension
+ ])
+
+ (define_c_enum "unspec" [
+@@ -191,6 +193,8 @@
UNSPEC_VBSL
UNSPEC_VCAGE
UNSPEC_VCAGT
@@ -58451,7 +64453,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
UNSPEC_VCEQ
UNSPEC_VCGE
UNSPEC_VCGEU
-@@ -203,6 +205,20 @@
+@@ -203,6 +207,20 @@
UNSPEC_VCVT_U
UNSPEC_VCVT_S_N
UNSPEC_VCVT_U_N
@@ -58472,7 +64474,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
UNSPEC_VEXT
UNSPEC_VHADD_S
UNSPEC_VHADD_U
-@@ -244,6 +260,8 @@
+@@ -244,6 +262,8 @@
UNSPEC_VMLSL_S_LANE
UNSPEC_VMLSL_U_LANE
UNSPEC_VMLSL_LANE
@@ -58481,7 +64483,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
UNSPEC_VMOVL_S
UNSPEC_VMOVL_U
UNSPEC_VMOVN
-@@ -365,5 +383,11 @@
+@@ -365,5 +385,11 @@
UNSPEC_NVRINTN
UNSPEC_VQRDMLAH
UNSPEC_VQRDMLSH
@@ -58519,7 +64521,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(match_operand:VE 1 "s_register_operand" "")
--- a/src/gcc/config/arm/vfp.md
+++ b/src/gcc/config/arm/vfp.md
-@@ -18,6 +18,199 @@
+@@ -18,13 +18,206 @@
;; along with GCC; see the file COPYING3. If not see
;; <http://www.gnu.org/licenses/>. */
@@ -58531,7 +64533,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ "=rk, r, r, m, r, *t, r, *t")
+ (match_operand:HI 1 "general_operand"
+ "rIk, K, n, r, mi, r, *t, *t"))]
-+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP
++ "TARGET_ARM && TARGET_HARD_FLOAT
+ && !TARGET_VFP_FP16INST
+ && (register_operand (operands[0], HImode)
+ || register_operand (operands[1], HImode))"
@@ -58582,7 +64584,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ "=rk, r, l, r, m, r, *t, r, *t")
+ (match_operand:HI 1 "general_operand"
+ "rk, I, Py, n, r, m, r, *t, *t"))]
-+ "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_VFP
++ "TARGET_THUMB2 && TARGET_HARD_FLOAT
+ && !TARGET_VFP_FP16INST
+ && (register_operand (operands[0], HImode)
+ || register_operand (operands[1], HImode))"
@@ -58719,6 +64721,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
;; SImode moves
;; ??? For now do not allow loading constants into vfp regs. This causes
;; problems because small constants get converted into adds.
+ (define_insn "*arm_movsi_vfp"
+ [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r,rk,m ,*t,r,*t,*t, *Uv")
+ (match_operand:SI 1 "general_operand" "rk, I,K,j,mi,rk,r,*t,*t,*Uvi,*t"))]
+- "TARGET_ARM && TARGET_VFP && TARGET_HARD_FLOAT
++ "TARGET_ARM && TARGET_HARD_FLOAT
+ && ( s_register_operand (operands[0], SImode)
+ || s_register_operand (operands[1], SImode))"
+ "*
@@ -53,7 +246,8 @@
}
"
@@ -58729,6 +64739,33 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(set_attr "pool_range" "*,*,*,*,4096,*,*,*,*,1020,*")
(set_attr "neg_pool_range" "*,*,*,*,4084,*,*,*,*,1008,*")]
)
+@@ -66,7 +260,7 @@
+ (define_insn "*thumb2_movsi_vfp"
+ [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,l,r,r, l,*hk,m, *m,*t, r,*t,*t, *Uv")
+ (match_operand:SI 1 "general_operand" "rk,I,Py,K,j,mi,*mi,l,*hk, r,*t,*t,*Uvi,*t"))]
+- "TARGET_THUMB2 && TARGET_VFP && TARGET_HARD_FLOAT
++ "TARGET_THUMB2 && TARGET_HARD_FLOAT
+ && ( s_register_operand (operands[0], SImode)
+ || s_register_operand (operands[1], SImode))"
+ "*
+@@ -112,7 +306,7 @@
+ (define_insn "*movdi_vfp"
+ [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r,r,r,r,q,q,m,w,r,w,w, Uv")
+ (match_operand:DI 1 "di_operand" "r,rDa,Db,Dc,mi,mi,q,r,w,w,Uvi,w"))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP && arm_tune != cortexa8
++ "TARGET_32BIT && TARGET_HARD_FLOAT && arm_tune != cortexa8
+ && ( register_operand (operands[0], DImode)
+ || register_operand (operands[1], DImode))
+ && !(TARGET_NEON && CONST_INT_P (operands[1])
+@@ -163,7 +357,7 @@
+ (define_insn "*movdi_vfp_cortexa8"
+ [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r,r,r,r,r,r,m,w,!r,w,w, Uv")
+ (match_operand:DI 1 "di_operand" "r,rDa,Db,Dc,mi,mi,r,r,w,w,Uvi,w"))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP && arm_tune == cortexa8
++ "TARGET_32BIT && TARGET_HARD_FLOAT && arm_tune == cortexa8
+ && ( register_operand (operands[0], DImode)
+ || register_operand (operands[1], DImode))
+ && !(TARGET_NEON && CONST_INT_P (operands[1])
@@ -211,10 +405,87 @@
)
@@ -58824,24 +64861,44 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(match_operand:HF 1 "general_operand" " m,r,t,r,r,t,F"))]
- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FP16 && !TARGET_NEON_FP16
+ "TARGET_32BIT
-+ && TARGET_HARD_FLOAT && TARGET_VFP
++ && TARGET_HARD_FLOAT
+ && !TARGET_NEON_FP16
+ && !TARGET_VFP_FP16INST
&& ( s_register_operand (operands[0], HFmode)
|| s_register_operand (operands[1], HFmode))"
"*
-@@ -394,8 +668,8 @@
+@@ -321,7 +595,7 @@
+ (define_insn "*movsf_vfp"
+ [(set (match_operand:SF 0 "nonimmediate_operand" "=t,?r,t ,t ,Uv,r ,m,t,r")
+ (match_operand:SF 1 "general_operand" " ?r,t,Dv,UvE,t, mE,r,t,r"))]
+- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP
++ "TARGET_ARM && TARGET_HARD_FLOAT
+ && ( s_register_operand (operands[0], SFmode)
+ || s_register_operand (operands[1], SFmode))"
+ "*
+@@ -357,7 +631,7 @@
+ (define_insn "*thumb2_movsf_vfp"
+ [(set (match_operand:SF 0 "nonimmediate_operand" "=t,?r,t, t ,Uv,r ,m,t,r")
+ (match_operand:SF 1 "general_operand" " ?r,t,Dv,UvE,t, mE,r,t,r"))]
+- "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_VFP
++ "TARGET_THUMB2 && TARGET_HARD_FLOAT
+ && ( s_register_operand (operands[0], SFmode)
+ || s_register_operand (operands[1], SFmode))"
+ "*
+@@ -394,9 +668,9 @@
;; DFmode moves
(define_insn "*movdf_vfp"
- [(set (match_operand:DF 0 "nonimmediate_soft_df_operand" "=w,?r,w ,w ,Uv,r, m,w,r")
- (match_operand:DF 1 "soft_df_operand" " ?r,w,Dy,UvF,w ,mF,r,w,r"))]
+- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP
+ [(set (match_operand:DF 0 "nonimmediate_soft_df_operand" "=w,?r,w ,w,w ,Uv,r, m,w,r")
+ (match_operand:DF 1 "soft_df_operand" " ?r,w,Dy,G,UvF,w ,mF,r,w,r"))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP
++ "TARGET_ARM && TARGET_HARD_FLOAT
&& ( register_operand (operands[0], DFmode)
|| register_operand (operands[1], DFmode))"
-@@ -410,39 +684,43 @@
+ "*
+@@ -410,40 +684,44 @@
case 2:
gcc_assert (TARGET_VFP_DOUBLE);
return \"vmov%?.f64\\t%P0, %1\";
@@ -58892,11 +64949,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "*thumb2_movdf_vfp"
- [(set (match_operand:DF 0 "nonimmediate_soft_df_operand" "=w,?r,w ,w ,Uv,r ,m,w,r")
- (match_operand:DF 1 "soft_df_operand" " ?r,w,Dy,UvF,w, mF,r, w,r"))]
+- "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_VFP
+ [(set (match_operand:DF 0 "nonimmediate_soft_df_operand" "=w,?r,w ,w,w ,Uv,r ,m,w,r")
+ (match_operand:DF 1 "soft_df_operand" " ?r,w,Dy,G,UvF,w, mF,r, w,r"))]
- "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_VFP
++ "TARGET_THUMB2 && TARGET_HARD_FLOAT
&& ( register_operand (operands[0], DFmode)
|| register_operand (operands[1], DFmode))"
+ "*
@@ -457,11 +735,14 @@
case 2:
gcc_assert (TARGET_VFP_DOUBLE);
@@ -58939,7 +64998,43 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
)
-@@ -661,9 +943,63 @@
+@@ -494,7 +776,7 @@
+ [(match_operand 4 "cc_register" "") (const_int 0)])
+ (match_operand:SF 1 "s_register_operand" "0,t,t,0,?r,?r,0,t,t")
+ (match_operand:SF 2 "s_register_operand" "t,0,t,?r,0,?r,t,0,t")))]
+- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_ARM && TARGET_HARD_FLOAT"
+ "@
+ vmov%D3.f32\\t%0, %2
+ vmov%d3.f32\\t%0, %1
+@@ -517,7 +799,7 @@
+ [(match_operand 4 "cc_register" "") (const_int 0)])
+ (match_operand:SF 1 "s_register_operand" "0,t,t,0,?r,?r,0,t,t")
+ (match_operand:SF 2 "s_register_operand" "t,0,t,?r,0,?r,t,0,t")))]
+- "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_VFP && !arm_restrict_it"
++ "TARGET_THUMB2 && TARGET_HARD_FLOAT && !arm_restrict_it"
+ "@
+ it\\t%D3\;vmov%D3.f32\\t%0, %2
+ it\\t%d3\;vmov%d3.f32\\t%0, %1
+@@ -585,7 +867,7 @@
+ (define_insn "*abssf2_vfp"
+ [(set (match_operand:SF 0 "s_register_operand" "=t")
+ (abs:SF (match_operand:SF 1 "s_register_operand" "t")))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "vabs%?.f32\\t%0, %1"
+ [(set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")
+@@ -605,7 +887,7 @@
+ (define_insn "*negsf2_vfp"
+ [(set (match_operand:SF 0 "s_register_operand" "=t,?r")
+ (neg:SF (match_operand:SF 1 "s_register_operand" "t,r")))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "@
+ vneg%?.f32\\t%0, %1
+ eor%?\\t%0, %1, #-2147483648"
+@@ -661,14 +943,68 @@
(set_attr "type" "ffarithd")]
)
@@ -59003,7 +65098,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "*addsf3_vfp"
[(set (match_operand:SF 0 "s_register_operand" "=t")
(plus:SF (match_operand:SF 1 "s_register_operand" "t")
-@@ -686,6 +1022,17 @@
+ (match_operand:SF 2 "s_register_operand" "t")))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "vadd%?.f32\\t%0, %1, %2"
+ [(set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")
+@@ -686,12 +1022,23 @@
(set_attr "type" "faddd")]
)
@@ -59021,6 +65122,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "*subsf3_vfp"
[(set (match_operand:SF 0 "s_register_operand" "=t")
+ (minus:SF (match_operand:SF 1 "s_register_operand" "t")
+ (match_operand:SF 2 "s_register_operand" "t")))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "vsub%?.f32\\t%0, %1, %2"
+ [(set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")
@@ -712,6 +1059,19 @@
;; Division insns
@@ -59041,7 +65149,16 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
; VFP9 Erratum 760019: It's potentially unsafe to overwrite the input
; operands, so mark the output as early clobber for VFPv2 on ARMv5 or
; earlier.
-@@ -742,6 +1102,17 @@
+@@ -719,7 +1079,7 @@
+ [(set (match_operand:SF 0 "s_register_operand" "=&t,t")
+ (div:SF (match_operand:SF 1 "s_register_operand" "t,t")
+ (match_operand:SF 2 "s_register_operand" "t,t")))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "vdiv%?.f32\\t%0, %1, %2"
+ [(set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")
+@@ -742,11 +1102,22 @@
;; Multiplication insns
@@ -59059,7 +65176,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "*mulsf3_vfp"
[(set (match_operand:SF 0 "s_register_operand" "=t")
(mult:SF (match_operand:SF 1 "s_register_operand" "t")
-@@ -764,6 +1135,26 @@
+ (match_operand:SF 2 "s_register_operand" "t")))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "vmul%?.f32\\t%0, %1, %2"
+ [(set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")
+@@ -764,11 +1135,31 @@
(set_attr "type" "fmuld")]
)
@@ -59086,7 +65209,22 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "*mulsf3negsf_vfp"
[(set (match_operand:SF 0 "s_register_operand" "=t")
(mult:SF (neg:SF (match_operand:SF 1 "s_register_operand" "t"))
-@@ -813,6 +1204,18 @@
+ (match_operand:SF 2 "s_register_operand" "t")))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP && !flag_rounding_math"
++ "TARGET_32BIT && TARGET_HARD_FLOAT && !flag_rounding_math"
+ "vnmul%?.f32\\t%0, %1, %2"
+ [(set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")
+@@ -779,7 +1170,7 @@
+ [(set (match_operand:SF 0 "s_register_operand" "=t")
+ (neg:SF (mult:SF (match_operand:SF 1 "s_register_operand" "t")
+ (match_operand:SF 2 "s_register_operand" "t"))))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "vnmul%?.f32\\t%0, %1, %2"
+ [(set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")
+@@ -813,12 +1204,24 @@
;; Multiply-accumulate insns
;; 0 = 1 * 2 + 0
@@ -59105,7 +65243,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "*mulsf3addsf_vfp"
[(set (match_operand:SF 0 "s_register_operand" "=t")
(plus:SF (mult:SF (match_operand:SF 2 "s_register_operand" "t")
-@@ -838,6 +1241,17 @@
+ (match_operand:SF 3 "s_register_operand" "t"))
+ (match_operand:SF 1 "s_register_operand" "0")))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "vmla%?.f32\\t%0, %2, %3"
+ [(set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")
+@@ -838,12 +1241,23 @@
)
;; 0 = 1 * 2 - 0
@@ -59123,7 +65268,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "*mulsf3subsf_vfp"
[(set (match_operand:SF 0 "s_register_operand" "=t")
(minus:SF (mult:SF (match_operand:SF 2 "s_register_operand" "t")
-@@ -863,6 +1277,17 @@
+ (match_operand:SF 3 "s_register_operand" "t"))
+ (match_operand:SF 1 "s_register_operand" "0")))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "vnmls%?.f32\\t%0, %2, %3"
+ [(set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")
+@@ -863,12 +1277,23 @@
)
;; 0 = -(1 * 2) + 0
@@ -59141,7 +65293,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "*mulsf3negsfaddsf_vfp"
[(set (match_operand:SF 0 "s_register_operand" "=t")
(minus:SF (match_operand:SF 1 "s_register_operand" "0")
-@@ -889,6 +1314,18 @@
+ (mult:SF (match_operand:SF 2 "s_register_operand" "t")
+ (match_operand:SF 3 "s_register_operand" "t"))))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "vmls%?.f32\\t%0, %2, %3"
+ [(set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")
+@@ -889,13 +1314,25 @@
;; 0 = -(1 * 2) - 0
@@ -59160,6 +65319,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "*mulsf3negsfsubsf_vfp"
[(set (match_operand:SF 0 "s_register_operand" "=t")
(minus:SF (mult:SF
+ (neg:SF (match_operand:SF 2 "s_register_operand" "t"))
+ (match_operand:SF 3 "s_register_operand" "t"))
+ (match_operand:SF 1 "s_register_operand" "0")))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "vnmla%?.f32\\t%0, %2, %3"
+ [(set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")
@@ -917,6 +1354,30 @@
;; Fused-multiply-accumulate
@@ -59276,7 +65443,43 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
"vcvtb%?.f16.f32\\t%0, %1"
[(set_attr "predicable" "yes")
(set_attr "predicable_short_it" "no")
-@@ -1096,6 +1603,27 @@
+@@ -1013,7 +1520,7 @@
+ (define_insn "*truncsisf2_vfp"
+ [(set (match_operand:SI 0 "s_register_operand" "=t")
+ (fix:SI (fix:SF (match_operand:SF 1 "s_register_operand" "t"))))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "vcvt%?.s32.f32\\t%0, %1"
+ [(set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")
+@@ -1034,7 +1541,7 @@
+ (define_insn "fixuns_truncsfsi2"
+ [(set (match_operand:SI 0 "s_register_operand" "=t")
+ (unsigned_fix:SI (fix:SF (match_operand:SF 1 "s_register_operand" "t"))))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "vcvt%?.u32.f32\\t%0, %1"
+ [(set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")
+@@ -1055,7 +1562,7 @@
+ (define_insn "*floatsisf2_vfp"
+ [(set (match_operand:SF 0 "s_register_operand" "=t")
+ (float:SF (match_operand:SI 1 "s_register_operand" "t")))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "vcvt%?.f32.s32\\t%0, %1"
+ [(set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")
+@@ -1076,7 +1583,7 @@
+ (define_insn "floatunssisf2"
+ [(set (match_operand:SF 0 "s_register_operand" "=t")
+ (unsigned_float:SF (match_operand:SI 1 "s_register_operand" "t")))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "vcvt%?.f32.u32\\t%0, %1"
+ [(set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")
+@@ -1096,13 +1603,34 @@
;; Sqrt insns.
@@ -59304,6 +65507,65 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
; VFP9 Erratum 760019: It's potentially unsafe to overwrite the input
; operands, so mark the output as early clobber for VFPv2 on ARMv5 or
; earlier.
+ (define_insn "*sqrtsf2_vfp"
+ [(set (match_operand:SF 0 "s_register_operand" "=&t,t")
+ (sqrt:SF (match_operand:SF 1 "s_register_operand" "t,t")))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "vsqrt%?.f32\\t%0, %1"
+ [(set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")
+@@ -1127,7 +1655,7 @@
+ (define_insn "*movcc_vfp"
+ [(set (reg CC_REGNUM)
+ (reg VFPCC_REGNUM))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "vmrs%?\\tAPSR_nzcv, FPSCR"
+ [(set_attr "conds" "set")
+ (set_attr "type" "f_flag")]
+@@ -1137,9 +1665,9 @@
+ [(set (reg:CCFP CC_REGNUM)
+ (compare:CCFP (match_operand:SF 0 "s_register_operand" "t")
+ (match_operand:SF 1 "vfp_compare_operand" "tG")))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "#"
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ [(set (reg:CCFP VFPCC_REGNUM)
+ (compare:CCFP (match_dup 0)
+ (match_dup 1)))
+@@ -1152,9 +1680,9 @@
+ [(set (reg:CCFPE CC_REGNUM)
+ (compare:CCFPE (match_operand:SF 0 "s_register_operand" "t")
+ (match_operand:SF 1 "vfp_compare_operand" "tG")))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "#"
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ [(set (reg:CCFPE VFPCC_REGNUM)
+ (compare:CCFPE (match_dup 0)
+ (match_dup 1)))
+@@ -1203,7 +1731,7 @@
+ [(set (reg:CCFP VFPCC_REGNUM)
+ (compare:CCFP (match_operand:SF 0 "s_register_operand" "t,t")
+ (match_operand:SF 1 "vfp_compare_operand" "t,G")))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "@
+ vcmp%?.f32\\t%0, %1
+ vcmp%?.f32\\t%0, #0"
+@@ -1216,7 +1744,7 @@
+ [(set (reg:CCFPE VFPCC_REGNUM)
+ (compare:CCFPE (match_operand:SF 0 "s_register_operand" "t,t")
+ (match_operand:SF 1 "vfp_compare_operand" "t,G")))]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "@
+ vcmpe%?.f32\\t%0, %1
+ vcmpe%?.f32\\t%0, #0"
@@ -1252,9 +1780,6 @@
)
@@ -59314,7 +65576,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "*combine_vcvt_f32_<FCVTI32typename>"
[(set (match_operand:SF 0 "s_register_operand" "=t")
(mult:SF (FCVT:SF (match_operand:SI 1 "s_register_operand" "0"))
-@@ -1299,6 +1824,125 @@
+@@ -1299,13 +1824,132 @@
(set_attr "type" "f_cvtf2i")]
)
@@ -59440,6 +65702,14 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
;; Store multiple insn used in function prologue.
(define_insn "*push_multi_vfp"
[(match_parallel 2 "multi_register_push"
+ [(set (match_operand:BLK 0 "memory_operand" "=m")
+ (unspec:BLK [(match_operand:DF 1 "vfp_register_operand" "")]
+ UNSPEC_PUSH_MULT))])]
+- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
++ "TARGET_32BIT && TARGET_HARD_FLOAT"
+ "* return vfp_output_vstmd (operands);"
+ [(set_attr "type" "f_stored")]
+ )
@@ -1368,6 +2012,20 @@
)
@@ -59461,6 +65731,44 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
(define_insn "<fmaxmin><mode>3"
[(set (match_operand:SDF 0 "s_register_operand" "=<F_constraint>")
(unspec:SDF [(match_operand:SDF 1 "s_register_operand" "<F_constraint>")
+@@ -1382,7 +2040,7 @@
+ ;; Write Floating-point Status and Control Register.
+ (define_insn "set_fpscr"
+ [(unspec_volatile [(match_operand:SI 0 "register_operand" "r")] VUNSPEC_SET_FPSCR)]
+- "TARGET_VFP && TARGET_HARD_FLOAT"
++ "TARGET_HARD_FLOAT"
+ "mcr\\tp10, 7, %0, cr1, cr0, 0\\t @SET_FPSCR"
+ [(set_attr "type" "mrs")])
+
+@@ -1390,7 +2048,7 @@
+ (define_insn "get_fpscr"
+ [(set (match_operand:SI 0 "register_operand" "=r")
+ (unspec_volatile:SI [(const_int 0)] VUNSPEC_GET_FPSCR))]
+- "TARGET_VFP && TARGET_HARD_FLOAT"
++ "TARGET_HARD_FLOAT"
+ "mrc\\tp10, 7, %0, cr1, cr0, 0\\t @GET_FPSCR"
+ [(set_attr "type" "mrs")])
+
+--- a/src/gcc/config/i386/i386.c
++++ b/src/gcc/config/i386/i386.c
+@@ -23,6 +23,7 @@ along with GCC; see the file COPYING3. If not see
+ #include "backend.h"
+ #include "rtl.h"
+ #include "tree.h"
++#include "memmodel.h"
+ #include "gimple.h"
+ #include "cfghooks.h"
+ #include "cfgloop.h"
+--- a/src/gcc/config/ia64/ia64.c
++++ b/src/gcc/config/ia64/ia64.c
+@@ -26,6 +26,7 @@ along with GCC; see the file COPYING3. If not see
+ #include "target.h"
+ #include "rtl.h"
+ #include "tree.h"
++#include "memmodel.h"
+ #include "cfghooks.h"
+ #include "df.h"
+ #include "tm_p.h"
--- a/src/gcc/config/linux.c
+++ b/src/gcc/config/linux.c
@@ -26,7 +26,7 @@ along with GCC; see the file COPYING3. If not see
@@ -59472,9 +65780,78 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return true;
if (OPTION_BIONIC)
if (fn_class == function_c94
+--- a/src/gcc/config/mips/mips.c
++++ b/src/gcc/config/mips/mips.c
+@@ -28,6 +28,7 @@ along with GCC; see the file COPYING3. If not see
+ #include "target.h"
+ #include "rtl.h"
+ #include "tree.h"
++#include "memmodel.h"
+ #include "gimple.h"
+ #include "cfghooks.h"
+ #include "df.h"
+--- a/src/gcc/config/rs6000/rs6000.c
++++ b/src/gcc/config/rs6000/rs6000.c
+@@ -24,6 +24,7 @@
+ #include "backend.h"
+ #include "rtl.h"
+ #include "tree.h"
++#include "memmodel.h"
+ #include "gimple.h"
+ #include "cfghooks.h"
+ #include "cfgloop.h"
+--- a/src/gcc/config/sparc/sparc.c
++++ b/src/gcc/config/sparc/sparc.c
+@@ -27,6 +27,7 @@ along with GCC; see the file COPYING3. If not see
+ #include "target.h"
+ #include "rtl.h"
+ #include "tree.h"
++#include "memmodel.h"
+ #include "gimple.h"
+ #include "df.h"
+ #include "tm_p.h"
+--- a/src/gcc/configure
++++ b/src/gcc/configure
+@@ -1711,7 +1711,8 @@ Optional Packages:
+ --with-stabs arrange to use stabs instead of host debug format
+ --with-dwarf2 force the default debug format to be DWARF 2
+ --with-specs=SPECS add SPECS to driver command-line processing
+- --with-pkgversion=PKG Use PKG in the version string in place of "GCC"
++ --with-pkgversion=PKG Use PKG in the version string in place of "Linaro
++ GCC `cat $srcdir/LINARO-VERSION`"
+ --with-bugurl=URL Direct users to URL to report a bug
+ --with-multilib-list select multilibs (AArch64, SH and x86-64 only)
+ --with-gnu-ld assume the C compiler uses GNU ld default=no
+@@ -7658,7 +7659,7 @@ if test "${with_pkgversion+set}" = set; then :
+ *) PKGVERSION="($withval) " ;;
+ esac
+ else
+- PKGVERSION="(GCC) "
++ PKGVERSION="(Linaro GCC `cat $srcdir/LINARO-VERSION`) "
+
+ fi
+
+@@ -18460,7 +18461,7 @@ else
+ lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
+ lt_status=$lt_dlunknown
+ cat > conftest.$ac_ext <<_LT_EOF
+-#line 18463 "configure"
++#line 18464 "configure"
+ #include "confdefs.h"
+
+ #if HAVE_DLFCN_H
+@@ -18566,7 +18567,7 @@ else
+ lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
+ lt_status=$lt_dlunknown
+ cat > conftest.$ac_ext <<_LT_EOF
+-#line 18569 "configure"
++#line 18570 "configure"
+ #include "confdefs.h"
+
+ #if HAVE_DLFCN_H
--- a/src/gcc/configure.ac
+++ b/src/gcc/configure.ac
-@@ -903,7 +903,7 @@ AC_ARG_WITH(specs,
+@@ -910,7 +910,7 @@ AC_ARG_WITH(specs,
)
AC_SUBST(CONFIGURE_SPECS)
@@ -59528,6 +65905,21 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
cpp_define_formatted (pfile, "__ATOMIC_RELAXED=%d", MEMMODEL_RELAXED);
cpp_define_formatted (pfile, "__ATOMIC_SEQ_CST=%d", MEMMODEL_SEQ_CST);
cpp_define_formatted (pfile, "__ATOMIC_ACQUIRE=%d", MEMMODEL_ACQUIRE);
+--- a/src/gcc/defaults.h
++++ b/src/gcc/defaults.h
+@@ -971,11 +971,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ #define REG_WORDS_BIG_ENDIAN WORDS_BIG_ENDIAN
+ #endif
+
+-#ifdef TARGET_FLT_EVAL_METHOD
+-#define TARGET_FLT_EVAL_METHOD_NON_DEFAULT 1
+-#else
++#ifndef TARGET_FLT_EVAL_METHOD
+ #define TARGET_FLT_EVAL_METHOD 0
+-#define TARGET_FLT_EVAL_METHOD_NON_DEFAULT 0
+ #endif
+
+ #ifndef TARGET_DEC_EVAL_METHOD
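
With TARGET_FLT_EVAL_METHOD_NON_DEFAULT gone (it is poisoned in system.h
below), -fexcess-precision=standard is rejected unconditionally by the
Fortran and Java front ends further down, rather than only on targets with a
non-default evaluation method. For reference, a minimal standard-C probe of
the user-visible counterpart of TARGET_FLT_EVAL_METHOD (plain C99, nothing
patch-specific assumed):

    /* FLT_EVAL_METHOD (C99 <float.h>) reflects the target's
       TARGET_FLT_EVAL_METHOD: 0 = evaluate in the operand's type,
       2 = evaluate in long double (e.g. x87).  */
    #include <float.h>
    #include <stdio.h>

    int
    main (void)
    {
      printf ("FLT_EVAL_METHOD = %d\n", (int) FLT_EVAL_METHOD);
      return 0;
    }
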
--- a/src/gcc/expmed.c
+++ b/src/gcc/expmed.c
@@ -2522,16 +2522,8 @@ expand_variable_shift (enum tree_code code, machine_mode mode, rtx shifted,
@@ -59594,6 +65986,38 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
offset = offset + ((bitpos / BITS_PER_UNIT) & ~3);
if (offset >= off
&& offset - off < len)
+--- a/src/gcc/fortran/options.c
++++ b/src/gcc/fortran/options.c
+@@ -208,8 +208,7 @@ gfc_post_options (const char **pfilename)
+
+ /* Excess precision other than "fast" requires front-end
+ support. */
+- if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD
+- && TARGET_FLT_EVAL_METHOD_NON_DEFAULT)
++ if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD)
+ sorry ("-fexcess-precision=standard for Fortran");
+ flag_excess_precision_cmdline = EXCESS_PRECISION_FAST;
+
+--- a/src/gcc/genconditions.c
++++ b/src/gcc/genconditions.c
+@@ -94,6 +94,7 @@ write_header (void)
+ #include \"resource.h\"\n\
+ #include \"diagnostic-core.h\"\n\
+ #include \"reload.h\"\n\
++#include \"memmodel.h\"\n\
+ #include \"tm-constrs.h\"\n");
+
+ if (saw_eh_return)
+--- a/src/gcc/genemit.c
++++ b/src/gcc/genemit.c
+@@ -792,6 +792,7 @@ from the machine description file `md'. */\n\n");
+ printf ("#include \"reload.h\"\n");
+ printf ("#include \"diagnostic-core.h\"\n");
+ printf ("#include \"regs.h\"\n");
++ printf ("#include \"memmodel.h\"\n");
+ printf ("#include \"tm-constrs.h\"\n");
+ printf ("#include \"ggc.h\"\n");
+ printf ("#include \"dumpfile.h\"\n");
--- a/src/gcc/genmultilib
+++ b/src/gcc/genmultilib
@@ -186,7 +186,8 @@ fi
@@ -59633,6 +66057,116 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
exit 1
fi
done
+--- a/src/gcc/genoutput.c
++++ b/src/gcc/genoutput.c
+@@ -231,6 +231,7 @@ output_prologue (void)
+ printf ("#include \"diagnostic-core.h\"\n");
+ printf ("#include \"output.h\"\n");
+ printf ("#include \"target.h\"\n");
++ printf ("#include \"memmodel.h\"\n");
+ printf ("#include \"tm-constrs.h\"\n");
+ }
+
+--- a/src/gcc/genpeep.c
++++ b/src/gcc/genpeep.c
+@@ -373,6 +373,7 @@ from the machine description file `md'. */\n\n");
+ printf ("#include \"except.h\"\n");
+ printf ("#include \"diagnostic-core.h\"\n");
+ printf ("#include \"flags.h\"\n");
++ printf ("#include \"memmodel.h\"\n");
+ printf ("#include \"tm-constrs.h\"\n\n");
+
+ printf ("extern rtx peep_operand[];\n\n");
+--- a/src/gcc/genpreds.c
++++ b/src/gcc/genpreds.c
+@@ -1577,6 +1577,7 @@ write_insn_preds_c (void)
+ #include \"reload.h\"\n\
+ #include \"regs.h\"\n\
+ #include \"emit-rtl.h\"\n\
++#include \"memmodel.h\"\n\
+ #include \"tm-constrs.h\"\n");
+
+ FOR_ALL_PREDICATES (p)
+--- a/src/gcc/genrecog.c
++++ b/src/gcc/genrecog.c
+@@ -4172,6 +4172,7 @@ write_header (void)
+ #include \"diagnostic-core.h\"\n\
+ #include \"reload.h\"\n\
+ #include \"regs.h\"\n\
++#include \"memmodel.h\"\n\
+ #include \"tm-constrs.h\"\n\
+ \n");
+
+--- a/src/gcc/gimple-fold.c
++++ b/src/gcc/gimple-fold.c
+@@ -1379,6 +1379,55 @@ gimple_fold_builtin_strncpy (gimple_stmt_iterator *gsi,
+ return true;
+ }
+
++/* Simplify strchr (str, 0) into str + strlen (str).
++ In general strlen is significantly faster than strchr
++ due to being a simpler operation. */
++static bool
++gimple_fold_builtin_strchr (gimple_stmt_iterator *gsi)
++{
++ gimple *stmt = gsi_stmt (*gsi);
++ tree str = gimple_call_arg (stmt, 0);
++ tree c = gimple_call_arg (stmt, 1);
++ location_t loc = gimple_location (stmt);
++
++ if (optimize_function_for_size_p (cfun))
++ return false;
++
++ if (!integer_zerop (c) || !gimple_call_lhs (stmt))
++ return false;
++
++ tree len;
++ tree strlen_fn = builtin_decl_implicit (BUILT_IN_STRLEN);
++
++ if (!strlen_fn)
++ return false;
++
++ /* Create newstr = strlen (str). */
++ gimple_seq stmts = NULL;
++ gimple *new_stmt = gimple_build_call (strlen_fn, 1, str);
++ gimple_set_location (new_stmt, loc);
++ if (gimple_in_ssa_p (cfun))
++ len = make_ssa_name (size_type_node);
++ else
++ len = create_tmp_reg (size_type_node);
++ gimple_call_set_lhs (new_stmt, len);
++ gimple_seq_add_stmt_without_update (&stmts, new_stmt);
++
++ /* Create (str p+ strlen (str)). */
++ new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
++ POINTER_PLUS_EXPR, str, len);
++ gimple_seq_add_stmt_without_update (&stmts, new_stmt);
++ gsi_replace_with_seq_vops (gsi, stmts);
++ /* gsi now points at the assignment to the lhs; get a
++ stmt iterator to the strlen.
++ ??? We can't use gsi_for_stmt as that doesn't work when the
++ CFG isn't built yet. */
++ gimple_stmt_iterator gsi2 = *gsi;
++ gsi_prev (&gsi2);
++ fold_stmt (&gsi2);
++ return true;
++}
++
+ /* Simplify a call to the strcat builtin. DST and SRC are the arguments
+ to the call.
+
+@@ -2820,6 +2869,11 @@ gimple_fold_builtin (gimple_stmt_iterator *gsi)
+ gimple_call_arg (stmt, 1));
+ case BUILT_IN_STRNCAT:
+ return gimple_fold_builtin_strncat (gsi);
++ case BUILT_IN_STRCHR:
++ if (gimple_fold_builtin_strchr (gsi))
++ return true;
++ /* Perform additional folding in builtin.c. */
++ break;
+ case BUILT_IN_FPUTS:
+ return gimple_fold_builtin_fputs (gsi, gimple_call_arg (stmt, 0),
+ gimple_call_arg (stmt, 1), false);
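
The new gimple_fold_builtin_strchr above turns a search for the terminating
NUL into a strlen plus a pointer add; the strlenopt-*.c scan-count updates in
the testsuite below are the fallout from this. A source-level sketch of the
transformation:

    #include <string.h>

    /* Before the fold: a call to strchr looking for the NUL.  */
    char *
    find_end (char *s)
    {
      return strchr (s, 0);
    }

    /* What the fold produces at -O2; it is skipped under -Os via the
       optimize_function_for_size_p check above.  */
    char *
    find_end_folded (char *s)
    {
      return s + strlen (s);
    }
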
--- a/src/gcc/ifcvt.c
+++ b/src/gcc/ifcvt.c
@@ -813,10 +813,15 @@ struct noce_if_info
@@ -59960,6 +66494,18 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
enum machine_mode m = smallest_mode_for_size (p, MODE_INT);
tree optype = build_nonstandard_integer_type (GET_MODE_PRECISION (m),
uns0_p && uns1_p
+--- a/src/gcc/java/lang.c
++++ b/src/gcc/java/lang.c
+@@ -569,8 +569,7 @@ java_post_options (const char **pfilename)
+
+ /* Excess precision other than "fast" requires front-end
+ support. */
+- if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD
+- && TARGET_FLT_EVAL_METHOD_NON_DEFAULT)
++ if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD)
+ sorry ("-fexcess-precision=standard for Java");
+ flag_excess_precision_cmdline = EXCESS_PRECISION_FAST;
+
--- a/src/gcc/lra-constraints.c
+++ b/src/gcc/lra-constraints.c
@@ -1326,7 +1326,22 @@ process_addr_reg (rtx *loc, bool check_only_p, rtx_insn **before, rtx_insn **aft
@@ -60064,7 +66610,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
void lto_promote_statics_nonwpa (void);
--- a/src/gcc/lto/lto.c
+++ b/src/gcc/lto/lto.c
-@@ -3117,9 +3117,10 @@ do_whole_program_analysis (void)
+@@ -3123,9 +3123,10 @@ do_whole_program_analysis (void)
else if (flag_lto_partition == LTO_PARTITION_MAX)
lto_max_map ();
else if (flag_lto_partition == LTO_PARTITION_ONE)
@@ -60077,6 +66623,120 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
else
gcc_unreachable ();
+--- a/src/gcc/match.pd
++++ b/src/gcc/match.pd
+@@ -468,6 +468,12 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
+ (bit_and:c (convert? @0) (convert? (bit_not @0)))
+ { build_zero_cst (type); })
+
++/* PR71636: Transform x & ((1U << b) - 1) -> x & ~(~0U << b); */
++(simplify
++ (bit_and:c @0 (plus:s (lshift:s integer_onep @1) integer_minus_onep))
++ (if (TYPE_UNSIGNED (type))
++ (bit_and @0 (bit_not (lshift { build_all_ones_cst (type); } @1)))))
++
+ /* Fold (A & ~B) - (A & B) into (A ^ B) - B. */
+ (simplify
+ (minus (bit_and:cs @0 (bit_not @1)) (bit_and:cs @0 @1))
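
A minimal sketch of the identity the new PR71636 rule relies on: for unsigned
types and shift counts below the type width the two masks are bit-for-bit
equal, and the rewritten form needs no separate subtraction (the pr71636-*.c
tests added below scan for exactly this):

    /* For unsigned x and 0 <= b < 32:  (1U << b) - 1  ==  ~(~0U << b).  */
    unsigned
    low_bits_before (unsigned x, unsigned b)
    {
      return x & ((1U << b) - 1);   /* matched by the new rule */
    }

    unsigned
    low_bits_after (unsigned x, unsigned b)
    {
      return x & ~(~0U << b);       /* what the rule produces */
    }
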
+--- /dev/null
++++ b/src/gcc/memmodel.h
+@@ -0,0 +1,86 @@
++/* Prototypes of memory model helper functions.
++ Copyright (C) 2015-2016 Free Software Foundation, Inc.
++
++This file is part of GCC.
++
++GCC is free software; you can redistribute it and/or modify it under
++the terms of the GNU General Public License as published by the Free
++Software Foundation; either version 3, or (at your option) any later
++version.
++
++GCC is distributed in the hope that it will be useful, but WITHOUT ANY
++WARRANTY; without even the implied warranty of MERCHANTABILITY or
++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
++for more details.
++
++You should have received a copy of the GNU General Public License
++along with GCC; see the file COPYING3. If not see
++<http://www.gnu.org/licenses/>. */
++
++#ifndef GCC_MEMMODEL_H
++#define GCC_MEMMODEL_H
++
++/* Return the memory model from a host integer. */
++static inline enum memmodel
++memmodel_from_int (unsigned HOST_WIDE_INT val)
++{
++ return (enum memmodel) (val & MEMMODEL_MASK);
++}
++
++/* Return the base memory model from a host integer. */
++static inline enum memmodel
++memmodel_base (unsigned HOST_WIDE_INT val)
++{
++ return (enum memmodel) (val & MEMMODEL_BASE_MASK);
++}
++
++/* Return TRUE if the memory model is RELAXED. */
++static inline bool
++is_mm_relaxed (enum memmodel model)
++{
++ return (model & MEMMODEL_BASE_MASK) == MEMMODEL_RELAXED;
++}
++
++/* Return TRUE if the memory model is CONSUME. */
++static inline bool
++is_mm_consume (enum memmodel model)
++{
++ return (model & MEMMODEL_BASE_MASK) == MEMMODEL_CONSUME;
++}
++
++/* Return TRUE if the memory model is ACQUIRE. */
++static inline bool
++is_mm_acquire (enum memmodel model)
++{
++ return (model & MEMMODEL_BASE_MASK) == MEMMODEL_ACQUIRE;
++}
++
++/* Return TRUE if the memory model is RELEASE. */
++static inline bool
++is_mm_release (enum memmodel model)
++{
++ return (model & MEMMODEL_BASE_MASK) == MEMMODEL_RELEASE;
++}
++
++/* Return TRUE if the memory model is ACQ_REL. */
++static inline bool
++is_mm_acq_rel (enum memmodel model)
++{
++ return (model & MEMMODEL_BASE_MASK) == MEMMODEL_ACQ_REL;
++}
++
++/* Return TRUE if the memory model is SEQ_CST. */
++static inline bool
++is_mm_seq_cst (enum memmodel model)
++{
++ return (model & MEMMODEL_BASE_MASK) == MEMMODEL_SEQ_CST;
++}
++
++/* Return TRUE if the memory model is a SYNC variant. */
++static inline bool
++is_mm_sync (enum memmodel model)
++{
++ return (model & MEMMODEL_SYNC);
++}
++
++#endif /* GCC_MEMMODEL_H */
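
The new memmodel.h pulls the memory-model predicates out into a small header
so that backends and generated files can reach them cheaply; that is why
#include "memmodel.h" lines are threaded through i386.c, ia64.c, mips.c,
rs6000.c, sparc.c, optabs.c and the gen*.c programs elsewhere in this patch.
A hedged usage sketch (needs_release_fence is a hypothetical helper, not part
of the patch; the GCC-internal types are assumed to be in scope):

    /* Hypothetical backend helper: VAL is e.g. INTVAL of an atomic
       operand.  The is_mm_* predicates mask with MEMMODEL_BASE_MASK,
       so __sync-style models are classified by their base ordering.  */
    static bool
    needs_release_fence (unsigned HOST_WIDE_INT val)
    {
      enum memmodel model = memmodel_from_int (val);
      return is_mm_release (model)
             || is_mm_acq_rel (model)
             || is_mm_seq_cst (model);
    }
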
+--- a/src/gcc/optabs.c
++++ b/src/gcc/optabs.c
+@@ -25,6 +25,7 @@ along with GCC; see the file COPYING3. If not see
+ #include "target.h"
+ #include "rtl.h"
+ #include "tree.h"
++#include "memmodel.h"
+ #include "predict.h"
+ #include "tm_p.h"
+ #include "expmed.h"
--- a/src/gcc/params.def
+++ b/src/gcc/params.def
@@ -1027,7 +1027,12 @@ DEFPARAM (PARAM_LTO_PARTITIONS,
@@ -60095,7 +66755,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
--- a/src/gcc/rtlanal.c
+++ b/src/gcc/rtlanal.c
-@@ -3657,6 +3657,16 @@ subreg_get_info (unsigned int xregno, machine_mode xmode,
+@@ -3663,6 +3663,16 @@ subreg_get_info (unsigned int xregno, machine_mode xmode,
info->offset = offset / regsize_xmode;
return;
}
@@ -60112,7 +66772,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Quick exit for the simple and common case of extracting whole
subregisters from a multiregister value. */
/* ??? It would be better to integrate this into the code below,
-@@ -4584,13 +4594,14 @@ nonzero_bits1 (const_rtx x, machine_mode mode, const_rtx known_x,
+@@ -4590,13 +4600,14 @@ nonzero_bits1 (const_rtx x, machine_mode mode, const_rtx known_x,
nonzero &= cached_nonzero_bits (SUBREG_REG (x), mode,
known_x, known_mode, known_ret);
@@ -60134,7 +66794,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* On many CISC machines, accessing an object in a wider mode
--- a/src/gcc/simplify-rtx.c
+++ b/src/gcc/simplify-rtx.c
-@@ -5266,6 +5266,50 @@ simplify_const_relational_operation (enum rtx_code code,
+@@ -5274,6 +5274,50 @@ simplify_const_relational_operation (enum rtx_code code,
return 0;
}
@@ -60185,7 +66845,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Simplify CODE, an operation with result mode MODE and three operands,
OP0, OP1, and OP2. OP0_MODE was the mode of OP0 before it became
-@@ -5399,6 +5443,19 @@ simplify_ternary_operation (enum rtx_code code, machine_mode mode,
+@@ -5407,6 +5451,19 @@ simplify_ternary_operation (enum rtx_code code, machine_mode mode,
}
}
@@ -60205,6 +66865,18 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
if (COMPARISON_P (op0) && ! side_effects_p (op0))
{
machine_mode cmp_mode = (GET_MODE (XEXP (op0, 0)) == VOIDmode
+--- a/src/gcc/system.h
++++ b/src/gcc/system.h
+@@ -971,7 +971,8 @@ extern void fancy_abort (const char *, int, const char *) ATTRIBUTE_NORETURN;
+ EXTRA_ADDRESS_CONSTRAINT CONST_DOUBLE_OK_FOR_CONSTRAINT_P \
+ CALLER_SAVE_PROFITABLE LARGEST_EXPONENT_IS_NORMAL \
+ ROUND_TOWARDS_ZERO SF_SIZE DF_SIZE XF_SIZE TF_SIZE LIBGCC2_TF_CEXT \
+- LIBGCC2_LONG_DOUBLE_TYPE_SIZE STRUCT_VALUE EH_FRAME_IN_DATA_SECTION
++ LIBGCC2_LONG_DOUBLE_TYPE_SIZE STRUCT_VALUE \
++ EH_FRAME_IN_DATA_SECTION TARGET_FLT_EVAL_METHOD_NON_DEFAULT
+
+ /* Hooks that are no longer used. */
+ #pragma GCC poison LANG_HOOKS_FUNCTION_MARK LANG_HOOKS_FUNCTION_FREE \
--- a/src/gcc/testsuite/c-c++-common/asan/clone-test-1.c
+++ b/src/gcc/testsuite/c-c++-common/asan/clone-test-1.c
@@ -29,6 +29,10 @@ int main(int argc, char **argv) {
@@ -60295,6 +66967,22 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
struct A { int &operator[] (long); };
template <typename> struct B;
--- /dev/null
++++ b/src/gcc/testsuite/g++.dg/opt/pr78201.C
+@@ -0,0 +1,13 @@
++// PR middle-end/78201
++// { dg-do compile }
++// { dg-options "-O2" }
++
++struct B { long d (); } *c;
++long e;
++
++void
++foo ()
++{
++ char a[e] = "";
++ c && c->d();
++}
+--- /dev/null
+++ b/src/gcc/testsuite/gcc.c-torture/compile/pr71295.c
@@ -0,0 +1,12 @@
+extern void fn2 (long long);
@@ -60395,6 +67083,44 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
extern void abort (void);
+--- a/src/gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-5.c
++++ b/src/gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-5.c
+@@ -24,7 +24,7 @@
+ | FE_OVERFLOW \
+ | FE_UNDERFLOW)
+
+-#if defined __alpha__
++#if defined __alpha__ || defined __aarch64__
+ #define ITER_COUNT 100
+ #else
+ #define ITER_COUNT 10000
+--- a/src/gcc/testsuite/gcc.dg/cpp/trad/include.c
++++ b/src/gcc/testsuite/gcc.dg/cpp/trad/include.c
+@@ -2,11 +2,5 @@
+
+ /* Test that macros are not expanded in the <> quotes of #inlcude. */
+
+-/* vxWorksCommon.h uses the "#" operator to construct the name of an
+- include file, thus making the file incompatible with -traditional-cpp.
+- Newlib uses ## when including stdlib.h as of 2007-09-07. */
+-/* { dg-do preprocess { target { { ! vxworks_kernel } && { ! newlib } } } } */
+-
+-#define __STDC__ 1 /* Stop complaints about non-ISO compilers. */
+-#define stdlib 1
+-#include <stdlib.h> /* { dg-bogus "o such file or directory" } */
++#define builtins 1
++#include <builtins.h> /* { dg-bogus "o such file or directory" } */
+--- a/src/gcc/testsuite/gcc.dg/cpp/trad/trad.exp
++++ b/src/gcc/testsuite/gcc.dg/cpp/trad/trad.exp
+@@ -29,7 +29,7 @@ load_lib gcc-dg.exp
+ # If a testcase doesn't have special options, use these.
+ global DEFAULT_TRADCPPFLAGS
+ if ![info exists DEFAULT_TRADCPPFLAGS] then {
+- set DEFAULT_TRADCPPFLAGS " -traditional-cpp"
++ set DEFAULT_TRADCPPFLAGS " -traditional-cpp -I$srcdir/$subdir/"
+ }
+
+ # Initialize `dg'.
--- a/src/gcc/testsuite/gcc.dg/cpp/warn-undef-2.c
+++ b/src/gcc/testsuite/gcc.dg/cpp/warn-undef-2.c
@@ -1,5 +1,5 @@
@@ -60466,6 +67192,143 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+}
+
+/* { dg-final { scan-tree-dump "\\\[-INF, 0\\\]" "vrp1" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.dg/pr71636-1.c
+@@ -0,0 +1,9 @@
++/* { dg-do compile } */
++/* { dg-options "-fdump-tree-gimple" } */
++
++unsigned f(unsigned x, unsigned b)
++{
++ return x & ((1U << b) - 1);
++}
++
++/* { dg-final { scan-tree-dump-not "1 <<" "gimple" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.dg/pr71636-2.c
+@@ -0,0 +1,12 @@
++/* { dg-do compile } */
++/* { dg-options "-O2 -fdump-tree-forwprop-details" } */
++
++unsigned f(unsigned x, unsigned b)
++{
++ unsigned t1 = 1U << b;
++ unsigned t2 = t1 - 1;
++ unsigned t3 = x & t2;
++ return t3;
++}
++
++/* { dg-final { scan-tree-dump "_\[0-9\] = ~_\[0-9\]" "forwprop1" } } */
+--- a/src/gcc/testsuite/gcc.dg/strlenopt-20.c
++++ b/src/gcc/testsuite/gcc.dg/strlenopt-20.c
+@@ -86,9 +86,9 @@ main ()
+ return 0;
+ }
+
+-/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
++/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "memcpy \\(" 4 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
+-/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
++/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
+--- a/src/gcc/testsuite/gcc.dg/strlenopt-21.c
++++ b/src/gcc/testsuite/gcc.dg/strlenopt-21.c
+@@ -57,9 +57,9 @@ main ()
+ return 0;
+ }
+
+-/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
++/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "memcpy \\(" 3 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
+-/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
++/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
+--- a/src/gcc/testsuite/gcc.dg/strlenopt-22.c
++++ b/src/gcc/testsuite/gcc.dg/strlenopt-22.c
+@@ -31,9 +31,9 @@ main ()
+ return 0;
+ }
+
+-/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */
++/* { dg-final { scan-tree-dump-times "strlen \\(" 4 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "memcpy \\(" 1 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
+-/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
++/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
+--- a/src/gcc/testsuite/gcc.dg/strlenopt-22g.c
++++ b/src/gcc/testsuite/gcc.dg/strlenopt-22g.c
+@@ -5,9 +5,9 @@
+ #define USE_GNU
+ #include "strlenopt-22.c"
+
+-/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */
++/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "memcpy \\(" 1 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
+-/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
++/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "stpcpy \\(" 1 "strlen" } } */
+--- a/src/gcc/testsuite/gcc.dg/strlenopt-26.c
++++ b/src/gcc/testsuite/gcc.dg/strlenopt-26.c
+@@ -21,4 +21,5 @@ main (void)
+ return 0;
+ }
+
+-/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
++/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
++/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
+--- a/src/gcc/testsuite/gcc.dg/strlenopt-5.c
++++ b/src/gcc/testsuite/gcc.dg/strlenopt-5.c
+@@ -48,9 +48,9 @@ main ()
+ return 0;
+ }
+
+-/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */
++/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
+-/* { dg-final { scan-tree-dump-times "strchr \\(" 2 "strlen" } } */
++/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
+--- a/src/gcc/testsuite/gcc.dg/strlenopt-7.c
++++ b/src/gcc/testsuite/gcc.dg/strlenopt-7.c
+@@ -40,11 +40,11 @@ main ()
+ return 0;
+ }
+
+-/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */
++/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
+-/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
++/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "\\*r_\[0-9\]* = 0;" 1 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "return 3;" 1 "optimized" } } */
+--- a/src/gcc/testsuite/gcc.dg/strlenopt-9.c
++++ b/src/gcc/testsuite/gcc.dg/strlenopt-9.c
+@@ -98,10 +98,10 @@ main ()
+ return 0;
+ }
+
+-/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */
++/* { dg-final { scan-tree-dump-times "strlen \\(" 5 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "memcpy \\(" 6 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
+-/* { dg-final { scan-tree-dump-times "strchr \\(" 3 "strlen" } } */
++/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
+ /* { dg-final { scan-tree-dump-times "return 4;" 1 "optimized" } } */
--- a/src/gcc/testsuite/gcc.dg/torture/arm-fp16-int-convert-alt.c
+++ b/src/gcc/testsuite/gcc.dg/torture/arm-fp16-int-convert-alt.c
@@ -1,5 +1,6 @@
@@ -61388,7 +68251,21 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Various string construction helpers. */
/*
-@@ -81,7 +89,7 @@ extern size_t strlen(const char *);
+@@ -24,6 +32,13 @@ extern size_t strlen(const char *);
+ VECT_VAR(expected, int, 16, 4) -> expected_int16x4
+ VECT_VAR_DECL(expected, int, 16, 4) -> int16x4_t expected_int16x4
+ */
++/* Some instructions exist only on AArch64 and not on 32-bit ARM.
++ Use this macro to guard against them. */
++#ifdef __aarch64__
++#define AARCH64_ONLY(X) X
++#else
++#define AARCH64_ONLY(X)
++#endif
+
+ #define xSTR(X) #X
+ #define STR(X) xSTR(X)
+@@ -81,7 +96,7 @@ extern size_t strlen(const char *);
abort(); \
} \
} \
@@ -61397,7 +68274,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
/* Floating-point variant. */
-@@ -110,7 +118,7 @@ extern size_t strlen(const char *);
+@@ -110,7 +125,7 @@ extern size_t strlen(const char *);
abort(); \
} \
} \
@@ -61406,7 +68283,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
/* Clean buffer with a non-zero pattern to help diagnose buffer
-@@ -133,10 +141,16 @@ static ARRAY(result, uint, 32, 2);
+@@ -133,10 +148,16 @@ static ARRAY(result, uint, 32, 2);
static ARRAY(result, uint, 64, 1);
static ARRAY(result, poly, 8, 8);
static ARRAY(result, poly, 16, 4);
@@ -61423,7 +68300,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
static ARRAY(result, int, 8, 16);
static ARRAY(result, int, 16, 8);
static ARRAY(result, int, 32, 4);
-@@ -147,6 +161,9 @@ static ARRAY(result, uint, 32, 4);
+@@ -147,6 +168,9 @@ static ARRAY(result, uint, 32, 4);
static ARRAY(result, uint, 64, 2);
static ARRAY(result, poly, 8, 16);
static ARRAY(result, poly, 16, 8);
@@ -61433,7 +68310,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
static ARRAY(result, float, 16, 8);
#endif
-@@ -169,6 +186,7 @@ extern ARRAY(expected, poly, 8, 8);
+@@ -169,6 +193,7 @@ extern ARRAY(expected, poly, 8, 8);
extern ARRAY(expected, poly, 16, 4);
extern ARRAY(expected, hfloat, 16, 4);
extern ARRAY(expected, hfloat, 32, 2);
@@ -61441,7 +68318,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
extern ARRAY(expected, int, 8, 16);
extern ARRAY(expected, int, 16, 8);
extern ARRAY(expected, int, 32, 4);
-@@ -335,7 +353,8 @@ extern int VECT_VAR(expected_cumulative_sat, uint, 64, 2);
+@@ -335,7 +360,8 @@ extern int VECT_VAR(expected_cumulative_sat, uint, 64, 2);
strlen(COMMENT) > 0 ? " " COMMENT : ""); \
abort(); \
} \
@@ -61451,7 +68328,73 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
#define CHECK_CUMULATIVE_SAT_NAMED(test_name,EXPECTED,comment) \
-@@ -500,15 +519,6 @@ static void clean_results (void)
+@@ -379,6 +405,9 @@ static void clean_results (void)
+ CLEAN(result, uint, 64, 1);
+ CLEAN(result, poly, 8, 8);
+ CLEAN(result, poly, 16, 4);
++#if defined (__ARM_FEATURE_CRYPTO)
++ CLEAN(result, poly, 64, 1);
++#endif
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ CLEAN(result, float, 16, 4);
+ #endif
+@@ -394,6 +423,9 @@ static void clean_results (void)
+ CLEAN(result, uint, 64, 2);
+ CLEAN(result, poly, 8, 16);
+ CLEAN(result, poly, 16, 8);
++#if defined (__ARM_FEATURE_CRYPTO)
++ CLEAN(result, poly, 64, 2);
++#endif
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ CLEAN(result, float, 16, 8);
+ #endif
+@@ -419,6 +451,13 @@ static void clean_results (void)
+ #define DECL_VARIABLE(VAR, T1, W, N) \
+ VECT_TYPE(T1, W, N) VECT_VAR(VAR, T1, W, N)
+
++#if defined (__ARM_FEATURE_CRYPTO)
++#define DECL_VARIABLE_CRYPTO(VAR, T1, W, N) \
++ DECL_VARIABLE(VAR, T1, W, N)
++#else
++#define DECL_VARIABLE_CRYPTO(VAR, T1, W, N)
++#endif
++
+ /* Declare only 64 bits signed variants. */
+ #define DECL_VARIABLE_64BITS_SIGNED_VARIANTS(VAR) \
+ DECL_VARIABLE(VAR, int, 8, 8); \
+@@ -454,6 +493,7 @@ static void clean_results (void)
+ DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(VAR); \
+ DECL_VARIABLE(VAR, poly, 8, 8); \
+ DECL_VARIABLE(VAR, poly, 16, 4); \
++ DECL_VARIABLE_CRYPTO(VAR, poly, 64, 1); \
+ DECL_VARIABLE(VAR, float, 16, 4); \
+ DECL_VARIABLE(VAR, float, 32, 2)
+ #else
+@@ -462,6 +502,7 @@ static void clean_results (void)
+ DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(VAR); \
+ DECL_VARIABLE(VAR, poly, 8, 8); \
+ DECL_VARIABLE(VAR, poly, 16, 4); \
++ DECL_VARIABLE_CRYPTO(VAR, poly, 64, 1); \
+ DECL_VARIABLE(VAR, float, 32, 2)
+ #endif
+
+@@ -472,6 +513,7 @@ static void clean_results (void)
+ DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(VAR); \
+ DECL_VARIABLE(VAR, poly, 8, 16); \
+ DECL_VARIABLE(VAR, poly, 16, 8); \
++ DECL_VARIABLE_CRYPTO(VAR, poly, 64, 2); \
+ DECL_VARIABLE(VAR, float, 16, 8); \
+ DECL_VARIABLE(VAR, float, 32, 4)
+ #else
+@@ -480,6 +522,7 @@ static void clean_results (void)
+ DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(VAR); \
+ DECL_VARIABLE(VAR, poly, 8, 16); \
+ DECL_VARIABLE(VAR, poly, 16, 8); \
++ DECL_VARIABLE_CRYPTO(VAR, poly, 64, 2); \
+ DECL_VARIABLE(VAR, float, 32, 4)
+ #endif
+ /* Declare all variants. */
+@@ -500,15 +543,6 @@ static void clean_results (void)
/* Helpers to initialize vectors. */
#define VDUP(VAR, Q, T1, T2, W, N, V) \
VECT_VAR(VAR, T1, W, N) = vdup##Q##_n_##T2##W(V)
@@ -61467,6 +68410,38 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
#define VSET_LANE(VAR, Q, T1, T2, W, N, L, V) \
VECT_VAR(VAR, T1, W, N) = vset##Q##_lane_##T2##W(V, \
+@@ -521,6 +555,13 @@ static void clean_results (void)
+
+ /* Helpers to call macros with 1 constant and 5 variable
+ arguments. */
++#if defined (__ARM_FEATURE_CRYPTO)
++#define MACRO_CRYPTO(MACRO, VAR1, VAR2, T1, T2, T3, W, N) \
++ MACRO(VAR1, VAR2, T1, T2, T3, W, N)
++#else
++#define MACRO_CRYPTO(MACRO, VAR1, VAR2, T1, T2, T3, W, N)
++#endif
++
+ #define TEST_MACRO_64BITS_SIGNED_VARIANTS_1_5(MACRO, VAR) \
+ MACRO(VAR, , int, s, 8, 8); \
+ MACRO(VAR, , int, s, 16, 4); \
+@@ -591,13 +632,15 @@ static void clean_results (void)
+ TEST_MACRO_64BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2); \
+ TEST_MACRO_64BITS_UNSIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2); \
+ MACRO(VAR1, VAR2, , poly, p, 8, 8); \
+- MACRO(VAR1, VAR2, , poly, p, 16, 4)
++ MACRO(VAR1, VAR2, , poly, p, 16, 4); \
++ MACRO_CRYPTO(MACRO, VAR1, VAR2, , poly, p, 64, 1)
+
+ #define TEST_MACRO_128BITS_VARIANTS_2_5(MACRO, VAR1, VAR2) \
+ TEST_MACRO_128BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2); \
+ TEST_MACRO_128BITS_UNSIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2); \
+ MACRO(VAR1, VAR2, q, poly, p, 8, 16); \
+- MACRO(VAR1, VAR2, q, poly, p, 16, 8)
++ MACRO(VAR1, VAR2, q, poly, p, 16, 8); \
++ MACRO_CRYPTO(MACRO, VAR1, VAR2, q, poly, p, 64, 2)
+
+ #define TEST_MACRO_ALL_VARIANTS_2_5(MACRO, VAR1, VAR2) \
+ TEST_MACRO_64BITS_VARIANTS_2_5(MACRO, VAR1, VAR2); \
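
DECL_VARIABLE_CRYPTO and MACRO_CRYPTO above follow the usual
conditional-expansion idiom: the variant lists stay uniform and the poly64
entries simply vanish when __ARM_FEATURE_CRYPTO is not defined. A standalone
sketch of the idiom (CRYPTO_ONLY is an illustrative name, not from the
patch):

    /* Expand the argument only when the crypto extension is available;
       otherwise the declaration disappears during preprocessing.  */
    #if defined (__ARM_FEATURE_CRYPTO)
    #define CRYPTO_ONLY(X) X
    #else
    #define CRYPTO_ONLY(X)
    #endif

    CRYPTO_ONLY (static unsigned long long poly64_results[2];)

    int
    main (void)
    {
      CRYPTO_ONLY (poly64_results[0] = 0;)
      return 0;
    }
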
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/binary_op_float.inc
@@ -0,0 +1,170 @@
@@ -62306,12 +69281,13 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
VECT_VAR_DECL(buffer_dup_pad, float, 16, 8);
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
-@@ -0,0 +1,663 @@
+@@ -0,0 +1,1024 @@
+/* This file contains tests for all the *p64 intrinsics, except for
+ vreinterpret, which has its own testcase. */
+
-+/* { dg-require-effective-target arm_crypto_ok } */
++/* { dg-require-effective-target arm_crypto_ok { target { arm*-*-* } } } */
+/* { dg-add-options arm_crypto } */
++/* { dg-additional-options "-march=armv8-a+crypto" { target { aarch64*-*-* } } } */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
@@ -62347,6 +69323,17 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+VECT_VAR_DECL(vdup_n_expected2,poly,64,2) [] = { 0xfffffffffffffff2,
+ 0xfffffffffffffff2 };
+
++/* Expected results: vmov_n. */
++VECT_VAR_DECL(vmov_n_expected0,poly,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(vmov_n_expected0,poly,64,2) [] = { 0xfffffffffffffff0,
++ 0xfffffffffffffff0 };
++VECT_VAR_DECL(vmov_n_expected1,poly,64,1) [] = { 0xfffffffffffffff1 };
++VECT_VAR_DECL(vmov_n_expected1,poly,64,2) [] = { 0xfffffffffffffff1,
++ 0xfffffffffffffff1 };
++VECT_VAR_DECL(vmov_n_expected2,poly,64,1) [] = { 0xfffffffffffffff2 };
++VECT_VAR_DECL(vmov_n_expected2,poly,64,2) [] = { 0xfffffffffffffff2,
++ 0xfffffffffffffff2 };
++
+/* Expected results: vext. */
+VECT_VAR_DECL(vext_expected,poly,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL(vext_expected,poly,64,2) [] = { 0xfffffffffffffff1, 0x88 };
@@ -62354,6 +69341,9 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+/* Expected results: vget_low. */
+VECT_VAR_DECL(vget_low_expected,poly,64,1) [] = { 0xfffffffffffffff0 };
+
++/* Expected results: vget_high. */
++VECT_VAR_DECL(vget_high_expected,poly,64,1) [] = { 0xfffffffffffffff1 };
++
+/* Expected results: vld1. */
+VECT_VAR_DECL(vld1_expected,poly,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL(vld1_expected,poly,64,2) [] = { 0xfffffffffffffff0,
@@ -62418,6 +69408,39 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+VECT_VAR_DECL(vst1_lane_expected,poly,64,2) [] = { 0xfffffffffffffff0,
+ 0x3333333333333333 };
+
++/* Expected results: vldX_lane. */
++VECT_VAR_DECL(expected_vld_st2_0,poly,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(expected_vld_st2_0,poly,64,2) [] = { 0xfffffffffffffff0,
++ 0xfffffffffffffff1 };
++VECT_VAR_DECL(expected_vld_st2_1,poly,64,1) [] = { 0xfffffffffffffff1 };
++VECT_VAR_DECL(expected_vld_st2_1,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa,
++ 0xaaaaaaaaaaaaaaaa };
++VECT_VAR_DECL(expected_vld_st3_0,poly,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(expected_vld_st3_0,poly,64,2) [] = { 0xfffffffffffffff0,
++ 0xfffffffffffffff1 };
++VECT_VAR_DECL(expected_vld_st3_1,poly,64,1) [] = { 0xfffffffffffffff1 };
++VECT_VAR_DECL(expected_vld_st3_1,poly,64,2) [] = { 0xfffffffffffffff2,
++ 0xaaaaaaaaaaaaaaaa };
++VECT_VAR_DECL(expected_vld_st3_2,poly,64,1) [] = { 0xfffffffffffffff2 };
++VECT_VAR_DECL(expected_vld_st3_2,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa,
++ 0xaaaaaaaaaaaaaaaa };
++VECT_VAR_DECL(expected_vld_st4_0,poly,64,1) [] = { 0xfffffffffffffff0 };
++VECT_VAR_DECL(expected_vld_st4_0,poly,64,2) [] = { 0xfffffffffffffff0,
++ 0xfffffffffffffff1 };
++VECT_VAR_DECL(expected_vld_st4_1,poly,64,1) [] = { 0xfffffffffffffff1 };
++VECT_VAR_DECL(expected_vld_st4_1,poly,64,2) [] = { 0xfffffffffffffff2,
++ 0xfffffffffffffff3 };
++VECT_VAR_DECL(expected_vld_st4_2,poly,64,1) [] = { 0xfffffffffffffff2 };
++VECT_VAR_DECL(expected_vld_st4_2,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa,
++ 0xaaaaaaaaaaaaaaaa };
++VECT_VAR_DECL(expected_vld_st4_3,poly,64,1) [] = { 0xfffffffffffffff3 };
++VECT_VAR_DECL(expected_vld_st4_3,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa,
++ 0xaaaaaaaaaaaaaaaa };
++
++/* Expected results: vget_lane. */
++VECT_VAR_DECL(vget_lane_expected,poly,64,1) = 0xfffffffffffffff0;
++VECT_VAR_DECL(vget_lane_expected,poly,64,2) = 0xfffffffffffffff0;
++
+int main (void)
+{
+ int i;
@@ -62650,6 +69673,26 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+
+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vget_low_expected, "");
+
++ /* vget_high_p64 tests. */
++#undef TEST_MSG
++#define TEST_MSG "VGET_HIGH"
++
++#define TEST_VGET_HIGH(T1, T2, W, N, N2) \
++ VECT_VAR(vget_high_vector64, T1, W, N) = \
++ vget_high_##T2##W(VECT_VAR(vget_high_vector128, T1, W, N2)); \
++ vst1_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vget_high_vector64, T1, W, N))
++
++ DECL_VARIABLE(vget_high_vector64, poly, 64, 1);
++ DECL_VARIABLE(vget_high_vector128, poly, 64, 2);
++
++ CLEAN(result, poly, 64, 1);
++
++ VLOAD(vget_high_vector128, buffer, q, poly, p, 64, 2);
++
++ TEST_VGET_HIGH(poly, p, 64, 1, 2);
++
++ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vget_high_expected, "");
++
+ /* vld1_p64 tests. */
+#undef TEST_MSG
+#define TEST_MSG "VLD1/VLD1Q"
@@ -62954,7 +69997,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ VECT_VAR(vst1_lane_vector, T1, W, N) = \
+ vld1##Q##_##T2##W(VECT_VAR(buffer, T1, W, N)); \
+ vst1##Q##_lane_##T2##W(VECT_VAR(result, T1, W, N), \
-+ VECT_VAR(vst1_lane_vector, T1, W, N), L)
++ VECT_VAR(vst1_lane_vector, T1, W, N), L);
+
+ DECL_VARIABLE(vst1_lane_vector, poly, 64, 1);
+ DECL_VARIABLE(vst1_lane_vector, poly, 64, 2);
@@ -62968,6 +70011,299 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vst1_lane_expected, "");
+ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vst1_lane_expected, "");
+
++#ifdef __aarch64__
++
++ /* vmov_n_p64 tests. */
++#undef TEST_MSG
++#define TEST_MSG "VMOV/VMOVQ"
++
++#define TEST_VMOV(Q, T1, T2, W, N) \
++ VECT_VAR(vmov_n_vector, T1, W, N) = \
++ vmov##Q##_n_##T2##W(VECT_VAR(buffer_dup, T1, W, N)[i]); \
++ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vmov_n_vector, T1, W, N))
++
++ DECL_VARIABLE(vmov_n_vector, poly, 64, 1);
++ DECL_VARIABLE(vmov_n_vector, poly, 64, 2);
++
++ /* Try to read from different places in the input buffer. */
++ for (i = 0; i < 3; i++) {
++ CLEAN(result, poly, 64, 1);
++ CLEAN(result, poly, 64, 2);
++
++ TEST_VMOV(, poly, p, 64, 1);
++ TEST_VMOV(q, poly, p, 64, 2);
++
++ switch (i) {
++ case 0:
++ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vmov_n_expected0, "");
++ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vmov_n_expected0, "");
++ break;
++ case 1:
++ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vmov_n_expected1, "");
++ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vmov_n_expected1, "");
++ break;
++ case 2:
++ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vmov_n_expected2, "");
++ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vmov_n_expected2, "");
++ break;
++ default:
++ abort();
++ }
++ }
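++
A minimal standalone equivalent of what TEST_VMOV exercises, as a sketch assuming an AArch64 compiler invoked with -march=armv8-a+crypto (vmov_n_p64 and the store intrinsics come from arm_neon.h):

#include <arm_neon.h>
#include <stdio.h>
#include <inttypes.h>

int
main (void)
{
  poly64_t out1[1], out2[2];

  /* Splat one scalar into every lane of a 64-bit and a 128-bit vector.  */
  vst1_p64 (out1, vmov_n_p64 (0xfffffffffffffff0ull));
  vst1q_p64 (out2, vmovq_n_p64 (0xfffffffffffffff1ull));

  printf ("0x%" PRIx64 " 0x%" PRIx64 "\n",
          (uint64_t) out1[0], (uint64_t) out2[1]);
  return 0;
}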
++
++ /* vget_lane_p64 tests. */
++#undef TEST_MSG
++#define TEST_MSG "VGET_LANE/VGETQ_LANE"
++
++#define TEST_VGET_LANE(Q, T1, T2, W, N, L) \
++ VECT_VAR(vget_lane_vector, T1, W, N) = vget##Q##_lane_##T2##W(VECT_VAR(vector, T1, W, N), L); \
++ if (VECT_VAR(vget_lane_vector, T1, W, N) != VECT_VAR(vget_lane_expected, T1, W, N)) { \
++ fprintf(stderr, \
++ "ERROR in %s (%s line %d in result '%s') at type %s " \
++ "got 0x%" PRIx##W " != 0x%" PRIx##W "\n", \
++ TEST_MSG, __FILE__, __LINE__, \
++ STR(VECT_VAR(vget_lane_expected, T1, W, N)), \
++ STR(VECT_NAME(T1, W, N)), \
++ VECT_VAR(vget_lane_vector, T1, W, N), \
++ VECT_VAR(vget_lane_expected, T1, W, N)); \
++ abort (); \
++ }
++
++ /* Initialize input values. */
++ DECL_VARIABLE(vector, poly, 64, 1);
++ DECL_VARIABLE(vector, poly, 64, 2);
++
++ VLOAD(vector, buffer, , poly, p, 64, 1);
++ VLOAD(vector, buffer, q, poly, p, 64, 2);
++
++ VECT_VAR_DECL(vget_lane_vector, poly, 64, 1);
++ VECT_VAR_DECL(vget_lane_vector, poly, 64, 2);
++
++ TEST_VGET_LANE( , poly, p, 64, 1, 0);
++ TEST_VGET_LANE(q, poly, p, 64, 2, 0);
++
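The same check without the harness, as a sketch assuming -march=armv8-a+crypto:

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  poly64_t in[2] = { 0xfffffffffffffff0ull, 0xfffffffffffffff1ull };
  poly64x1_t d = vld1_p64 (in);
  poly64x2_t q = vld1q_p64 (in);

  /* vget_lane_p64/vgetq_lane_p64 extract a single poly64_t lane.  */
  assert ((uint64_t) vget_lane_p64 (d, 0) == 0xfffffffffffffff0ull);
  assert ((uint64_t) vgetq_lane_p64 (q, 1) == 0xfffffffffffffff1ull);
  return 0;
}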
++ /* vldx_lane_p64 tests. */
++#undef TEST_MSG
++#define TEST_MSG "VLDX_LANE/VLDXQ_LANE"
++
++VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 64, 2);
++VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 64, 3);
++VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 64, 4);
++
++ /* In this case, input variables are arrays of vectors. */
++#define DECL_VLD_STX_LANE(T1, W, N, X) \
++ VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector, T1, W, N, X); \
++ VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector_src, T1, W, N, X); \
++ VECT_VAR_DECL(result_bis_##X, T1, W, N)[X * N]
++
++ /* We need to use a temporary result buffer (result_bis), because
++ the one used for other tests is not large enough. A subset of the
++ result data is moved from result_bis to result, and it is this
++ subset which is used to check the actual behavior. The next
++ macro makes it possible to move another chunk of data from
++ result_bis to result. */
++ /* We also use another extra input buffer (buffer_src), which we
++ fill with 0xAA, and which is used to load a vector from which we
++ read a given lane. */
++
++#define TEST_VLDX_LANE(Q, T1, T2, W, N, X, L) \
++ memset (VECT_VAR(buffer_src, T1, W, N), 0xAA, \
++ sizeof(VECT_VAR(buffer_src, T1, W, N))); \
++ \
++ VECT_ARRAY_VAR(vector_src, T1, W, N, X) = \
++ vld##X##Q##_##T2##W(VECT_VAR(buffer_src, T1, W, N)); \
++ \
++ VECT_ARRAY_VAR(vector, T1, W, N, X) = \
++ /* Use dedicated init buffer, of size X. */ \
++ vld##X##Q##_lane_##T2##W(VECT_VAR(buffer_vld##X##_lane, T1, W, X), \
++ VECT_ARRAY_VAR(vector_src, T1, W, N, X), \
++ L); \
++ vst##X##Q##_##T2##W(VECT_VAR(result_bis_##X, T1, W, N), \
++ VECT_ARRAY_VAR(vector, T1, W, N, X)); \
++ memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(result_bis_##X, T1, W, N), \
++ sizeof(VECT_VAR(result, T1, W, N)))
++
++ /* Overwrite "result" with the contents of "result_bis"[Y]. */
++#undef TEST_EXTRA_CHUNK
++#define TEST_EXTRA_CHUNK(T1, W, N, X, Y) \
++ memcpy(VECT_VAR(result, T1, W, N), \
++ &(VECT_VAR(result_bis_##X, T1, W, N)[Y*N]), \
++ sizeof(VECT_VAR(result, T1, W, N)));
++
++ /* Add some padding to try to catch out-of-bounds accesses. */
++#define ARRAY1(V, T, W, N) VECT_VAR_DECL(V,T,W,N)[1]={42}
++#define DUMMY_ARRAY(V, T, W, N, L) \
++ VECT_VAR_DECL(V,T,W,N)[N*L]={0}; \
++ ARRAY1(V##_pad,T,W,N)
++
++#define DECL_ALL_VLD_STX_LANE(X) \
++ DECL_VLD_STX_LANE(poly, 64, 1, X); \
++ DECL_VLD_STX_LANE(poly, 64, 2, X);
++
++#define TEST_ALL_VLDX_LANE(X) \
++ TEST_VLDX_LANE(, poly, p, 64, 1, X, 0); \
++ TEST_VLDX_LANE(q, poly, p, 64, 2, X, 0);
++
++#define TEST_ALL_EXTRA_CHUNKS(X,Y) \
++ TEST_EXTRA_CHUNK(poly, 64, 1, X, Y) \
++ TEST_EXTRA_CHUNK(poly, 64, 2, X, Y)
++
++#define CHECK_RESULTS_VLD_STX_LANE(test_name,EXPECTED,comment) \
++ CHECK(test_name, poly, 64, 1, PRIx64, EXPECTED, comment); \
++ CHECK(test_name, poly, 64, 2, PRIx64, EXPECTED, comment);
++
++ /* Declare the temporary buffers / variables. */
++ DECL_ALL_VLD_STX_LANE(2);
++ DECL_ALL_VLD_STX_LANE(3);
++ DECL_ALL_VLD_STX_LANE(4);
++
++ DUMMY_ARRAY(buffer_src, poly, 64, 1, 4);
++ DUMMY_ARRAY(buffer_src, poly, 64, 2, 4);
++
++ /* Check vld2_lane/vld2q_lane. */
++ clean_results ();
++#undef TEST_MSG
++#define TEST_MSG "VLD2_LANE/VLD2Q_LANE"
++ TEST_ALL_VLDX_LANE(2);
++ CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st2_0, " chunk 0");
++
++ TEST_ALL_EXTRA_CHUNKS(2, 1);
++ CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st2_1, " chunk 1");
++
++ /* Check vld3_lane/vld3q_lane. */
++ clean_results ();
++#undef TEST_MSG
++#define TEST_MSG "VLD3_LANE/VLD3Q_LANE"
++ TEST_ALL_VLDX_LANE(3);
++ CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st3_0, " chunk 0");
++
++ TEST_ALL_EXTRA_CHUNKS(3, 1);
++ CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st3_1, " chunk 1");
++
++ TEST_ALL_EXTRA_CHUNKS(3, 2);
++ CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st3_2, " chunk 2");
++
++ /* Check vld4_lane/vld4q_lane. */
++ clean_results ();
++#undef TEST_MSG
++#define TEST_MSG "VLD4_LANE/VLD4Q_LANE"
++ TEST_ALL_VLDX_LANE(4);
++ CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st4_0, " chunk 0");
++
++ TEST_ALL_EXTRA_CHUNKS(4, 1);
++ CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st4_1, " chunk 1");
++
++ TEST_ALL_EXTRA_CHUNKS(4, 2);
++ CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st4_2, " chunk 2");
++
++ TEST_ALL_EXTRA_CHUNKS(4, 3);
++ CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st4_3, " chunk 3");
++
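Stripped of the harness, the vldX_lane pattern being tested looks like the following sketch (again assuming -march=armv8-a+crypto): pre-fill an N-vector group, then reload one lane of each vector from memory.

#include <arm_neon.h>
#include <string.h>

int
main (void)
{
  poly64_t mem[2]  = { 0xfffffffffffffff0ull, 0xfffffffffffffff1ull };
  poly64_t fill[2] = { 0xaaaaaaaaaaaaaaaaull, 0xaaaaaaaaaaaaaaaaull };
  poly64_t out[2];

  poly64x1x2_t group = vld2_p64 (fill);       /* Both vectors = 0xAA...  */
  group = vld2_lane_p64 (mem, group, 0);      /* Reload lane 0 of each.  */
  vst2_p64 (out, group);

  return memcmp (out, mem, sizeof out) != 0;  /* Exit 0 on success.  */
}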
++ /* In this case, input variables are arrays of vectors. */
++#define DECL_VSTX_LANE(T1, W, N, X) \
++ VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector, T1, W, N, X); \
++ VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector_src, T1, W, N, X); \
++ VECT_VAR_DECL(result_bis_##X, T1, W, N)[X * N]
++
++ /* We need to use a temporary result buffer (result_bis), because
++ the one used for other tests is not large enough. A subset of the
++ result data is moved from result_bis to result, and it is this
++ subset which is used to check the actual behavior. The next
++ macro makes it possible to move another chunk of data from
++ result_bis to result. */
++ /* We also use another extra input buffer (buffer_src), which we
++ fill with 0xAA, and which is used to load a vector from which we
++ read a given lane. */
++#define TEST_VSTX_LANE(Q, T1, T2, W, N, X, L) \
++ memset (VECT_VAR(buffer_src, T1, W, N), 0xAA, \
++ sizeof(VECT_VAR(buffer_src, T1, W, N))); \
++ memset (VECT_VAR(result_bis_##X, T1, W, N), 0, \
++ sizeof(VECT_VAR(result_bis_##X, T1, W, N))); \
++ \
++ VECT_ARRAY_VAR(vector_src, T1, W, N, X) = \
++ vld##X##Q##_##T2##W(VECT_VAR(buffer_src, T1, W, N)); \
++ \
++ VECT_ARRAY_VAR(vector, T1, W, N, X) = \
++ /* Use dedicated init buffer, of size X. */ \
++ vld##X##Q##_lane_##T2##W(VECT_VAR(buffer_vld##X##_lane, T1, W, X), \
++ VECT_ARRAY_VAR(vector_src, T1, W, N, X), \
++ L); \
++ vst##X##Q##_lane_##T2##W(VECT_VAR(result_bis_##X, T1, W, N), \
++ VECT_ARRAY_VAR(vector, T1, W, N, X), \
++ L); \
++ memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(result_bis_##X, T1, W, N), \
++ sizeof(VECT_VAR(result, T1, W, N)));
++
++#define TEST_ALL_VSTX_LANE(X) \
++ TEST_VSTX_LANE(, poly, p, 64, 1, X, 0); \
++ TEST_VSTX_LANE(q, poly, p, 64, 2, X, 0);
++
++ /* Check vst2_lane/vst2q_lane. */
++ clean_results ();
++#undef TEST_MSG
++#define TEST_MSG "VST2_LANE/VST2Q_LANE"
++ TEST_ALL_VSTX_LANE(2);
++
++#define CMT " (chunk 0)"
++ CHECK(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st2_0, CMT);
++
++ TEST_ALL_EXTRA_CHUNKS(2, 1);
++#undef CMT
++#define CMT " chunk 1"
++ CHECK(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st2_1, CMT);
++
++ /* Check vst3_lane/vst3q_lane. */
++ clean_results ();
++#undef TEST_MSG
++#define TEST_MSG "VST3_LANE/VST3Q_LANE"
++ TEST_ALL_VSTX_LANE(3);
++
++#undef CMT
++#define CMT " (chunk 0)"
++ CHECK(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st3_0, CMT);
++
++ TEST_ALL_EXTRA_CHUNKS(3, 1);
++
++#undef CMT
++#define CMT " (chunk 1)"
++ CHECK(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st3_1, CMT);
++
++ TEST_ALL_EXTRA_CHUNKS(3, 2);
++
++#undef CMT
++#define CMT " (chunk 2)"
++ CHECK(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st3_2, CMT);
++
++ /* Check vst4_lane/vst4q_lane. */
++ clean_results ();
++#undef TEST_MSG
++#define TEST_MSG "VST4_LANE/VST4Q_LANE"
++ TEST_ALL_VSTX_LANE(4);
++
++#undef CMT
++#define CMT " (chunk 0)"
++ CHECK(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_0, CMT);
++
++ TEST_ALL_EXTRA_CHUNKS(4, 1);
++
++#undef CMT
++#define CMT " (chunk 1)"
++ CHECK(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_1, CMT);
++
++ TEST_ALL_EXTRA_CHUNKS(4, 2);
++
++#undef CMT
++#define CMT " (chunk 2)"
++ CHECK(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_2, CMT);
++
++ TEST_ALL_EXTRA_CHUNKS(4, 3);
++
++#undef CMT
++#define CMT " (chunk 3)"
++ CHECK(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_3, CMT);
++
++#endif /* __aarch64__. */
++
+ return 0;
+}
--- /dev/null
@@ -74476,11 +81812,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
int main (void)
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret_p128.c
-@@ -0,0 +1,166 @@
+@@ -0,0 +1,165 @@
+/* This file contains tests for the vreinterpret *p128 intrinsics. */
+
-+/* { dg-require-effective-target arm_crypto_ok } */
++/* { dg-require-effective-target arm_crypto_ok { target { arm*-*-* } } } */
+/* { dg-add-options arm_crypto } */
++/* { dg-additional-options "-march=armv8-a+crypto" { target { aarch64*-*-* } } } */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
@@ -74557,9 +81894,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+int main (void)
+{
+ DECL_VARIABLE_128BITS_VARIANTS(vreint_vector);
-+ DECL_VARIABLE(vreint_vector, poly, 64, 2);
+ DECL_VARIABLE_128BITS_VARIANTS(vreint_vector_res);
-+ DECL_VARIABLE(vreint_vector_res, poly, 64, 2);
+
+ clean_results ();
+
@@ -74645,11 +81980,12 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+}
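The new test covers the poly128 reinterpret pairs; a minimal sketch of the intrinsics involved, assuming AArch64 with +crypto (which provides the 128-bit poly128_t type):

#include <arm_neon.h>

/* Reinterprets are bit-level casts; they generate no data movement.  */
poly128_t
as_p128 (uint32x4_t v)
{
  return vreinterpretq_p128_u32 (v);
}

uint32x4_t
as_u32x4 (poly128_t p)
{
  return vreinterpretq_u32_p128 (p);
}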
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret_p64.c
-@@ -0,0 +1,212 @@
+@@ -0,0 +1,209 @@
+/* This file contains tests for the vreinterpret *p64 intrinsics. */
+
-+/* { dg-require-effective-target arm_crypto_ok } */
++/* { dg-require-effective-target arm_crypto_ok { target { arm*-*-* } } } */
+/* { dg-add-options arm_crypto } */
++/* { dg-additional-options "-march=armv8-a+crypto" { target { aarch64*-*-* } } } */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
@@ -74769,11 +82105,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ CHECK_FP(TEST_MSG, T1, W, N, PRIx##W, EXPECTED, "");
+
+ DECL_VARIABLE_ALL_VARIANTS(vreint_vector);
-+ DECL_VARIABLE(vreint_vector, poly, 64, 1);
-+ DECL_VARIABLE(vreint_vector, poly, 64, 2);
+ DECL_VARIABLE_ALL_VARIANTS(vreint_vector_res);
-+ DECL_VARIABLE(vreint_vector_res, poly, 64, 1);
-+ DECL_VARIABLE(vreint_vector_res, poly, 64, 2);
+
+ clean_results ();
+
@@ -78740,6 +86072,30 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+}
+
+/* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 2 } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/aarch64/test_frame_17.c
+@@ -0,0 +1,21 @@
++/* { dg-do compile } */
++/* { dg-options "-O2 --save-temps" } */
++
++/* Test reuse of stack adjustment temporaries. */
++
++void foo ();
++
++int reuse_mov (int i)
++{
++ int arr[1025];
++ return arr[i];
++}
++
++int no_reuse_mov (int i)
++{
++ int arr[1025];
++ foo ();
++ return arr[i];
++}
++
++/* { dg-final { scan-assembler-times "mov\tx16, \[0-9\]+" 3 } } */
--- a/src/gcc/testsuite/gcc.target/aarch64/test_frame_6.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/test_frame_6.c
@@ -3,8 +3,7 @@
@@ -79287,6 +86643,122 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+#endif
+
--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/aapcs/vfp22.c
+@@ -0,0 +1,28 @@
++/* Test AAPCS layout (VFP variant) */
++
++/* { dg-do run { target arm_eabi } } */
++/* { dg-require-effective-target arm_hard_vfp_ok } */
++/* { dg-require-effective-target arm_fp16_hw } */
++/* { dg-add-options arm_fp16_alternative } */
++
++#ifndef IN_FRAMEWORK
++#define VFP
++#define TESTFILE "vfp22.c"
++#include "abitest.h"
++
++#else
++#if defined (__ARM_BIG_ENDIAN)
++ARG (__fp16, 1.0f, S0 + 2)
++#else
++ARG (__fp16, 1.0f, S0)
++#endif
++ARG (float, 2.0f, S1)
++ARG (double, 4.0, D1)
++ARG (float, 2.0f, S4)
++#if defined (__ARM_BIG_ENDIAN)
++ARG (__fp16, 1.0f, S5 + 2)
++#else
++ARG (__fp16, 1.0f, S5)
++#endif
++LAST_ARG (int, 3, R0)
++#endif
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/aapcs/vfp23.c
+@@ -0,0 +1,30 @@
++/* Test AAPCS layout (VFP variant) */
++
++/* { dg-do run { target arm_eabi } } */
++/* { dg-require-effective-target arm_hard_vfp_ok } */
++/* { dg-require-effective-target arm_fp16_hw } */
++/* { dg-add-options arm_fp16_alternative } */
++
++#ifndef IN_FRAMEWORK
++#define VFP
++#define TESTFILE "vfp23.c"
++
++__complex__ x = 1.0+2.0i;
++
++#include "abitest.h"
++#else
++#if defined (__ARM_BIG_ENDIAN)
++ARG (__fp16, 1.0f, S0 + 2)
++#else
++ARG (__fp16, 1.0f, S0)
++#endif
++ARG (float, 2.0f, S1)
++ARG (__complex__ double, x, D1)
++ARG (float, 3.0f, S6)
++#if defined (__ARM_BIG_ENDIAN)
++ARG (__fp16, 2.0f, S7 + 2)
++#else
++ARG (__fp16, 2.0f, S7)
++#endif
++LAST_ARG (int, 3, R0)
++#endif
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/aapcs/vfp24.c
+@@ -0,0 +1,21 @@
++/* Test AAPCS layout (VFP variant) */
++
++/* { dg-do run { target arm_eabi } } */
++/* { dg-require-effective-target arm_hard_vfp_ok } */
++/* { dg-require-effective-target arm_fp16_hw } */
++/* { dg-add-options arm_fp16_alternative } */
++
++#ifndef IN_FRAMEWORK
++#define VFP
++#define TESTFILE "vfp24.c"
++
++#define PCSATTR __attribute__((pcs("aapcs")))
++
++#include "abitest.h"
++#else
++ARG (float, 1.0f, R0)
++ARG (double, 2.0, R2)
++ARG (float, 3.0f, STACK)
++ARG (__fp16, 2.0f, STACK+4)
++LAST_ARG (double, 4.0, STACK+8)
++#endif
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/aapcs/vfp25.c
+@@ -0,0 +1,25 @@
++/* Test AAPCS layout (VFP variant) */
++
++/* { dg-do run { target arm_eabi } } */
++/* { dg-require-effective-target arm_hard_vfp_ok } */
++/* { dg-require-effective-target arm_fp16_hw } */
++/* { dg-add-options arm_fp16_alternative } */
++
++#ifndef IN_FRAMEWORK
++#define VFP
++#define TESTFILE "vfp25.c"
++
++#define PCSATTR __attribute__((pcs("aapcs")))
++
++#include "abitest.h"
++#else
++#if defined (__ARM_BIG_ENDIAN)
++ARG (__fp16, 1.0f, R0 + 2)
++#else
++ARG (__fp16, 1.0f, R0)
++#endif
++ARG (double, 2.0, R2)
++ARG (__fp16, 3.0f, STACK)
++ARG (float, 2.0f, STACK+4)
++LAST_ARG (double, 4.0, STACK+8)
++#endif
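vfp24.c and vfp25.c pin down the interaction of __fp16 with the pcs attribute: marking a function pcs("aapcs") forces the base (soft-float) calling convention even on a hard-float target, so FP arguments travel in core registers and on the stack rather than in s/d registers. A compile-only sketch of the attribute, with hypothetical declarations (only the attribute placement matters):

double __attribute__ ((pcs ("aapcs"))) soft_call (float a, double b);
double hard_call (float a, double b);   /* Default VFP convention.  */

double
caller (void)
{
  return soft_call (1.0f, 2.0) + hard_call (1.0f, 2.0);
}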
+--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/arm/armv5_thumb_isa.c
@@ -0,0 +1,8 @@
+/* { dg-require-effective-target arm_arch_v5_ok } */
@@ -80476,6 +87948,19 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+/* { dg-final { scan-assembler-times "ldaex" 4 } } */
+/* { dg-final { scan-assembler-times "stlex" 4 } } */
+/* { dg-final { scan-assembler-not "dmb" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/atomic-comp-swap-release-acquire-3.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_base_ok } */
++/* { dg-options "-O2 -fno-ipa-icf" } */
++/* { dg-add-options arm_arch_v8m_base } */
++
++#include "../aarch64/atomic-comp-swap-release-acquire.x"
++
++/* { dg-final { scan-assembler-times "ldaex" 4 } } */
++/* { dg-final { scan-assembler-times "stlex" 4 } } */
++/* { dg-final { scan-assembler-not "dmb" } } */
--- a/src/gcc/testsuite/gcc.target/arm/atomic-comp-swap-release-acquire.c
+++ b/src//dev/null
@@ -1,10 +0,0 @@
@@ -80515,6 +88000,19 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
+/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
+/* { dg-final { scan-assembler-not "dmb" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-acq_rel-3.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_base_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_arch_v8m_base } */
++
++#include "../aarch64/atomic-op-acq_rel.x"
++
++/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-not "dmb" } } */
--- a/src/gcc/testsuite/gcc.target/arm/atomic-op-acq_rel.c
+++ b/src//dev/null
@@ -1,10 +0,0 @@
@@ -80554,6 +88052,19 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
+/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
+/* { dg-final { scan-assembler-not "dmb" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-acquire-3.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_base_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_arch_v8m_base } */
++
++#include "../aarch64/atomic-op-acquire.x"
++
++/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-not "dmb" } } */
--- a/src/gcc/testsuite/gcc.target/arm/atomic-op-acquire.c
+++ b/src//dev/null
@@ -1,10 +0,0 @@
@@ -80593,6 +88104,19 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+/* { dg-final { scan-assembler-times "ldrexb\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
+/* { dg-final { scan-assembler-times "strexb\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
+/* { dg-final { scan-assembler-not "dmb" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-char-3.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_base_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_arch_v8m_base } */
++
++#include "../aarch64/atomic-op-char.x"
++
++/* { dg-final { scan-assembler-times "ldrexb\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-times "strexb\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-not "dmb" } } */
--- a/src/gcc/testsuite/gcc.target/arm/atomic-op-char.c
+++ b/src//dev/null
@@ -1,10 +0,0 @@
@@ -80621,468 +88145,3155 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
+/* { dg-final { scan-assembler-not "dmb" } } */
--- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-consume-2.c
-@@ -0,0 +1,11 @@
++++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-consume-2.c
+@@ -0,0 +1,11 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_main_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_arch_v8m_main } */
++
++#include "../aarch64/atomic-op-consume.x"
++
++/* The ldaex scan reflects the PR59448 workaround: consume is implemented as acquire. */
++/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-not "dmb" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-consume-3.c
+@@ -0,0 +1,11 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_base_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_arch_v8m_base } */
++
++#include "../aarch64/atomic-op-consume.x"
++
++/* The ldaex scan reflects the PR59448 workaround: consume is implemented as acquire. */
++/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-not "dmb" } } */
+--- a/src/gcc/testsuite/gcc.target/arm/atomic-op-consume.c
++++ b/src//dev/null
+@@ -1,11 +0,0 @@
+-/* { dg-do compile } */
+-/* { dg-require-effective-target arm_arch_v8a_ok } */
+-/* { dg-options "-O2" } */
+-/* { dg-add-options arm_arch_v8a } */
+-
+-#include "../aarch64/atomic-op-consume.x"
+-
+-/* Scan for ldaex is a PR59448 consume workaround. */
+-/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
+-/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
+-/* { dg-final { scan-assembler-not "dmb" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-int-1.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8a_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_arch_v8a } */
++
++#include "../aarch64/atomic-op-int.x"
++
++/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-not "dmb" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-int-2.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_main_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_arch_v8m_main } */
++
++#include "../aarch64/atomic-op-int.x"
++
++/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-not "dmb" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-int-3.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_base_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_arch_v8m_base } */
++
++#include "../aarch64/atomic-op-int.x"
++
++/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-not "dmb" } } */
+--- a/src/gcc/testsuite/gcc.target/arm/atomic-op-int.c
++++ b/src//dev/null
+@@ -1,10 +0,0 @@
+-/* { dg-do compile } */
+-/* { dg-require-effective-target arm_arch_v8a_ok } */
+-/* { dg-options "-O2" } */
+-/* { dg-add-options arm_arch_v8a } */
+-
+-#include "../aarch64/atomic-op-int.x"
+-
+-/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
+-/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
+-/* { dg-final { scan-assembler-not "dmb" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-relaxed-1.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8a_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_arch_v8a } */
++
++#include "../aarch64/atomic-op-relaxed.x"
++
++/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-not "dmb" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-relaxed-2.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_main_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_arch_v8m_main } */
++
++#include "../aarch64/atomic-op-relaxed.x"
++
++/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-not "dmb" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-relaxed-3.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_base_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_arch_v8m_base } */
++
++#include "../aarch64/atomic-op-relaxed.x"
++
++/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-not "dmb" } } */
+--- a/src/gcc/testsuite/gcc.target/arm/atomic-op-relaxed.c
++++ b/src//dev/null
+@@ -1,10 +0,0 @@
+-/* { dg-do compile } */
+-/* { dg-require-effective-target arm_arch_v8a_ok } */
+-/* { dg-options "-O2" } */
+-/* { dg-add-options arm_arch_v8a } */
+-
+-#include "../aarch64/atomic-op-relaxed.x"
+-
+-/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
+-/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
+-/* { dg-final { scan-assembler-not "dmb" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-release-1.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8a_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_arch_v8a } */
++
++#include "../aarch64/atomic-op-release.x"
++
++/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-not "dmb" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-release-2.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_main_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_arch_v8m_main } */
++
++#include "../aarch64/atomic-op-release.x"
++
++/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-not "dmb" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-release-3.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_base_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_arch_v8m_base } */
++
++#include "../aarch64/atomic-op-release.x"
++
++/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-not "dmb" } } */
+--- a/src/gcc/testsuite/gcc.target/arm/atomic-op-release.c
++++ b/src//dev/null
+@@ -1,10 +0,0 @@
+-/* { dg-do compile } */
+-/* { dg-require-effective-target arm_arch_v8a_ok } */
+-/* { dg-options "-O2" } */
+-/* { dg-add-options arm_arch_v8a } */
+-
+-#include "../aarch64/atomic-op-release.x"
+-
+-/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
+-/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
+-/* { dg-final { scan-assembler-not "dmb" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-seq_cst-1.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8a_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_arch_v8a } */
++
++#include "../aarch64/atomic-op-seq_cst.x"
++
++/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-not "dmb" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-seq_cst-2.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_main_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_arch_v8m_main } */
++
++#include "../aarch64/atomic-op-seq_cst.x"
++
++/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-not "dmb" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-seq_cst-3.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_base_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_arch_v8m_base } */
++
++#include "../aarch64/atomic-op-seq_cst.x"
++
++/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-not "dmb" } } */
+--- a/src/gcc/testsuite/gcc.target/arm/atomic-op-seq_cst.c
++++ b/src//dev/null
+@@ -1,10 +0,0 @@
+-/* { dg-do compile } */
+-/* { dg-require-effective-target arm_arch_v8a_ok } */
+-/* { dg-options "-O2" } */
+-/* { dg-add-options arm_arch_v8a } */
+-
+-#include "../aarch64/atomic-op-seq_cst.x"
+-
+-/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
+-/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
+-/* { dg-final { scan-assembler-not "dmb" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-short-1.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8a_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_arch_v8a } */
++
++#include "../aarch64/atomic-op-short.x"
++
++/* { dg-final { scan-assembler-times "ldrexh\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-times "strexh\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-not "dmb" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-short-2.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_main_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_arch_v8m_main } */
++
++#include "../aarch64/atomic-op-short.x"
++
++/* { dg-final { scan-assembler-times "ldrexh\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-times "strexh\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-not "dmb" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-short-3.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_base_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_arch_v8m_base } */
++
++#include "../aarch64/atomic-op-short.x"
++
++/* { dg-final { scan-assembler-times "ldrexh\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-times "strexh\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
++/* { dg-final { scan-assembler-not "dmb" } } */
+--- a/src/gcc/testsuite/gcc.target/arm/atomic-op-short.c
++++ b/src//dev/null
+@@ -1,10 +0,0 @@
+-/* { dg-do compile } */
+-/* { dg-require-effective-target arm_arch_v8a_ok } */
+-/* { dg-options "-O2" } */
+-/* { dg-add-options arm_arch_v8a } */
+-
+-#include "../aarch64/atomic-op-short.x"
+-
+-/* { dg-final { scan-assembler-times "ldrexh\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
+-/* { dg-final { scan-assembler-times "strexh\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
+-/* { dg-final { scan-assembler-not "dmb" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/attr-fp16-arith-1.c
+@@ -0,0 +1,58 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_v8_2a_fp16_scalar } */
++
++/* Reset fpu to a value compatible with the next pragmas. */
++#pragma GCC target ("fpu=vfp")
++
++#pragma GCC push_options
++#pragma GCC target ("fpu=fp-armv8")
++
++#ifndef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
++#error __ARM_FEATURE_FP16_SCALAR_ARITHMETIC not defined.
++#endif
++
++#pragma GCC push_options
++#pragma GCC target ("fpu=neon-fp-armv8")
++
++#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
++#error __ARM_FEATURE_FP16_VECTOR_ARITHMETIC not defined.
++#endif
++
++#ifndef __ARM_NEON
++#error __ARM_NEON not defined.
++#endif
++
++#if !defined (__ARM_FP) || !(__ARM_FP & 0x2)
++#error Invalid value for __ARM_FP
++#endif
++
++#include "arm_neon.h"
++
++float16_t
++foo (float16x4_t b)
++{
++ float16x4_t a = {2.0, 3.0, 4.0, 5.0};
++ float16x4_t res = vadd_f16 (a, b);
++
++ return res[0];
++}
++
++/* { dg-final { scan-assembler "vadd\\.f16\td\[0-9\]+, d\[0-9\]+" } } */
++
++#pragma GCC pop_options
++
++/* Check that the FP version is correctly reset to mfpu=fp-armv8. */
++
++#if !defined (__ARM_FP) || !(__ARM_FP & 0x2)
++#error __ARM_FP should record FP16 support.
++#endif
++
++#pragma GCC pop_options
++
++/* Check that the FP version is correctly reset to mfpu=vfp. */
++
++#if !defined (__ARM_FP) || (__ARM_FP & 0x2)
++#error Unexpected value for __ARM_FP.
++#endif
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/builtin_saddl.c
+@@ -0,0 +1,17 @@
++/* { dg-do compile } */
++/* { dg-options "-O2" } */
++/* { dg-require-effective-target arm32 } */
++extern void overflow_handler ();
++
++long overflow_add (long x, long y)
++{
++ long r;
++
++ int ovr = __builtin_saddl_overflow (x, y, &r);
++ if (ovr)
++ overflow_handler ();
++
++ return r;
++}
++
++/* { dg-final { scan-assembler "adds" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/builtin_saddll.c
+@@ -0,0 +1,18 @@
++/* { dg-do compile } */
++/* { dg-options "-O2" } */
++/* { dg-require-effective-target arm32 } */
++extern void overflow_handler ();
++
++long long overflow_add (long long x, long long y)
++{
++ long long r;
++
++ int ovr = __builtin_saddll_overflow (x, y, &r);
++ if (ovr)
++ overflow_handler ();
++
++ return r;
++}
++
++/* { dg-final { scan-assembler "adds" } } */
++/* { dg-final { scan-assembler "adcs" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/builtin_ssubl.c
+@@ -0,0 +1,17 @@
++/* { dg-do compile } */
++/* { dg-options "-O2" } */
++/* { dg-require-effective-target arm32 } */
++extern void overflow_handler ();
++
++long overflow_sub (long x, long y)
++{
++ long r;
++
++ int ovr = __builtin_ssubl_overflow (x, y, &r);
++ if (ovr)
++ overflow_handler ();
++
++ return r;
++}
++
++/* { dg-final { scan-assembler "subs" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/builtin_ssubll.c
+@@ -0,0 +1,18 @@
++/* { dg-do compile } */
++/* { dg-options "-O2" } */
++/* { dg-require-effective-target arm32 } */
++extern void overflow_handler ();
++
++long long overflow_sub (long long x, long long y)
++{
++ long long r;
++
++ int ovr = __builtin_ssubll_overflow (x, y, &r);
++ if (ovr)
++ overflow_handler ();
++
++ return r;
++}
++
++/* { dg-final { scan-assembler "subs" } } */
++/* { dg-final { scan-assembler "sbcs" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/builtin_uaddl.c
+@@ -0,0 +1,17 @@
++/* { dg-do compile } */
++/* { dg-options "-O2" } */
++/* { dg-require-effective-target arm32 } */
++extern void overflow_handler ();
++
++unsigned long overflow_add (unsigned long x, unsigned long y)
++{
++ unsigned long r;
++
++ int ovr = __builtin_uaddl_overflow (x, y, &r);
++ if (ovr)
++ overflow_handler ();
++
++ return r;
++}
++
++/* { dg-final { scan-assembler "adds" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/builtin_uaddll.c
+@@ -0,0 +1,18 @@
++/* { dg-do compile } */
++/* { dg-options "-O2" } */
++/* { dg-require-effective-target arm32 } */
++extern void overflow_handler ();
++
++unsigned long long overflow_add (unsigned long long x, unsigned long long y)
++{
++ unsigned long long r;
++
++ int ovr = __builtin_uaddll_overflow (x, y, &r);
++ if (ovr)
++ overflow_handler ();
++
++ return r;
++}
++
++/* { dg-final { scan-assembler "adds" } } */
++/* { dg-final { scan-assembler "adcs" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/builtin_usubl.c
+@@ -0,0 +1,17 @@
++/* { dg-do compile } */
++/* { dg-options "-O2" } */
++/* { dg-require-effective-target arm32 } */
++extern void overflow_handler ();
++
++unsigned long overflow_sub (unsigned long x, unsigned long y)
++{
++ unsigned long r;
++
++ int ovr = __builtin_usubl_overflow (x, y, &r);
++ if (ovr)
++ overflow_handler ();
++
++ return r;
++}
++
++/* { dg-final { scan-assembler "subs" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/builtin_usubll.c
+@@ -0,0 +1,18 @@
++/* { dg-do compile } */
++/* { dg-options "-O2" } */
++/* { dg-require-effective-target arm32 } */
++extern void overflow_handler ();
++
++unsigned long long overflow_sub (unsigned long long x, unsigned long long y)
++{
++ unsigned long long r;
++
++ int ovr = __builtin_usubll_overflow (x, y, &r);
++ if (ovr)
++ overflow_handler ();
++
++ return r;
++}
++
++/* { dg-final { scan-assembler "subs" } } */
++/* { dg-final { scan-assembler "sbcs" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cbz.c
+@@ -0,0 +1,12 @@
++/* { dg-do compile {target { arm_thumb2 || arm_thumb1_cbz_ok } } } */
++/* { dg-options "-O2" } */
++
++int
++foo (int a, int *b)
++{
++ if (a)
++ *b = 1;
++ return 0;
++}
++
++/* { dg-final { scan-assembler-times "cbz\\tr\\d" 1 } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-4.c
+@@ -0,0 +1,57 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++
++typedef struct
++{
++ unsigned char a;
++ unsigned int b:5;
++ unsigned int c:11, :0, d:8;
++ struct { unsigned int ee:2; } e;
++} test_st;
++
++typedef union
++{
++ test_st st;
++ struct
++ {
++ unsigned int v1;
++ unsigned int v2;
++ unsigned int v3;
++ unsigned int v4;
++ }values;
++} read_st;
++
++
++typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
++
++extern void foo (test_st st);
++
++int
++main (void)
++{
++ read_st r;
++ foo_ns f;
++
++ f = (foo_ns) 0x200000;
++ r.values.v1 = 0xFFFFFFFF;
++ r.values.v2 = 0xFFFFFFFF;
++ r.values.v3 = 0xFFFFFFFF;
++ r.values.v4 = 0xFFFFFFFF;
++
++ f (r.st);
++ return 0;
++}
++
++/* { dg-final { scan-assembler "mov\tip, r4" } } */
++/* { dg-final { scan-assembler "movw\tr4, #65535" } } */
++/* { dg-final { scan-assembler "movt\tr4, 255" } } */
++/* { dg-final { scan-assembler "ands\tr0, r4" } } */
++/* { dg-final { scan-assembler "movs\tr4, #255" } } */
++/* { dg-final { scan-assembler "ands\tr1, r4" } } */
++/* { dg-final { scan-assembler "movs\tr4, #3" } } */
++/* { dg-final { scan-assembler "ands\tr2, r4" } } */
++/* { dg-final { scan-assembler "mov\tr4, ip" } } */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "movs\tr3, r4" } } */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-5.c
+@@ -0,0 +1,53 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++
++typedef struct
++{
++ unsigned char a;
++ unsigned short b :5;
++ unsigned char c;
++ unsigned short d :11;
++} test_st;
++
++typedef union
++{
++ test_st st;
++ struct
++ {
++ unsigned int v1;
++ unsigned int v2;
++ unsigned int v3;
++ unsigned int v4;
++ }values;
++} read_st;
++
++
++typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
++
++int
++main (void)
++{
++ read_st r;
++ foo_ns f;
++
++ f = (foo_ns) 0x200000;
++ r.values.v1 = 0xFFFFFFFF;
++ r.values.v2 = 0xFFFFFFFF;
++
++ f (r.st);
++ return 0;
++}
++
++/* { dg-final { scan-assembler "mov\tip, r4" } } */
++/* { dg-final { scan-assembler "movw\tr4, #8191" } } */
++/* { dg-final { scan-assembler "movt\tr4, 255" } } */
++/* { dg-final { scan-assembler "ands\tr0, r4" } } */
++/* { dg-final { scan-assembler "movw\tr4, #2047" } } */
++/* { dg-final { scan-assembler "ands\tr1, r4" } } */
++/* { dg-final { scan-assembler "mov\tr4, ip" } } */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "movs\tr2, r4" } } */
++/* { dg-final { scan-assembler "movs\tr3, r4" } } */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
++
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-6.c
+@@ -0,0 +1,63 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++
++typedef struct
++{
++ unsigned char a;
++ unsigned int b : 3;
++ unsigned int c : 14;
++ unsigned int d : 1;
++ struct {
++ unsigned int ee : 2;
++ unsigned short ff : 15;
++ } e;
++ unsigned char g : 1;
++ unsigned char : 4;
++ unsigned char h : 3;
++} test_st;
++
++typedef union
++{
++ test_st st;
++ struct
++ {
++ unsigned int v1;
++ unsigned int v2;
++ unsigned int v3;
++ unsigned int v4;
++ }values;
++} read_st;
++
++
++typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
++
++int
++main (void)
++{
++ read_st r;
++ foo_ns f;
++
++ f = (foo_ns) 0x200000;
++ r.values.v1 = 0xFFFFFFFF;
++ r.values.v2 = 0xFFFFFFFF;
++ r.values.v3 = 0xFFFFFFFF;
++ r.values.v4 = 0xFFFFFFFF;
++
++ f (r.st);
++ return 0;
++}
++
++/* { dg-final { scan-assembler "mov\tip, r4" } } */
++/* { dg-final { scan-assembler "movw\tr4, #65535" } } */
++/* { dg-final { scan-assembler "movt\tr4, 1023" } } */
++/* { dg-final { scan-assembler "ands\tr0, r4" } } */
++/* { dg-final { scan-assembler "movs\tr4, #3" } } */
++/* { dg-final { scan-assembler "movt\tr4, 32767" } } */
++/* { dg-final { scan-assembler "ands\tr1, r4" } } */
++/* { dg-final { scan-assembler "movs\tr4, #255" } } */
++/* { dg-final { scan-assembler "ands\tr2, r4" } } */
++/* { dg-final { scan-assembler "mov\tr4, ip" } } */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "movs\tr3, r4" } } */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-7.c
+@@ -0,0 +1,54 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++
++typedef struct
++{
++ unsigned char a;
++ unsigned short b :5;
++ unsigned char c;
++ unsigned short d :11;
++} test_st;
++
++typedef union
++{
++ test_st st;
++ struct
++ {
++ unsigned int v1;
++ unsigned int v2;
++ unsigned int v3;
++ unsigned int v4;
++ }values;
++} read_st;
++
++
++typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
++
++int
++main (void)
++{
++ read_st r;
++ foo_ns f;
++
++ f = (foo_ns) 0x200000;
++ r.values.v1 = 0xFFFFFFFF;
++ r.values.v2 = 0xFFFFFFFF;
++
++ f (r.st);
++ return 0;
++}
++
++
++/* { dg-final { scan-assembler "mov\tip, r4" } } */
++/* { dg-final { scan-assembler "movw\tr4, #8191" } } */
++/* { dg-final { scan-assembler "movt\tr4, 255" } } */
++/* { dg-final { scan-assembler "ands\tr0, r4" } } */
++/* { dg-final { scan-assembler "movw\tr4, #2047" } } */
++/* { dg-final { scan-assembler "ands\tr1, r4" } } */
++/* { dg-final { scan-assembler "mov\tr4, ip" } } */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "movs\tr2, r4" } } */
++/* { dg-final { scan-assembler "movs\tr3, r4" } } */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
++
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-8.c
+@@ -0,0 +1,57 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++
++typedef struct
++{
++ unsigned char a;
++ unsigned int :0;
++ unsigned int b :1;
++ unsigned short :0;
++ unsigned short c;
++ unsigned int :0;
++ unsigned int d :21;
++} test_st;
++
++typedef union
++{
++ test_st st;
++ struct
++ {
++ unsigned int v1;
++ unsigned int v2;
++ unsigned int v3;
++ unsigned int v4;
++ }values;
++} read_st;
++
++typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
++
++int
++main (void)
++{
++ read_st r;
++ foo_ns f;
++
++ f = (foo_ns) 0x200000;
++ r.values.v1 = 0xFFFFFFFF;
++ r.values.v2 = 0xFFFFFFFF;
++ r.values.v3 = 0xFFFFFFFF;
++
++ f (r.st);
++ return 0;
++}
++
++/* { dg-final { scan-assembler "mov\tip, r4" } } */
++/* { dg-final { scan-assembler "movs\tr4, #255" } } */
++/* { dg-final { scan-assembler "ands\tr0, r4" } } */
++/* { dg-final { scan-assembler "movs\tr4, #1" } } */
++/* { dg-final { scan-assembler "movt\tr4, 65535" } } */
++/* { dg-final { scan-assembler "ands\tr1, r4" } } */
++/* { dg-final { scan-assembler "movw\tr4, #65535" } } */
++/* { dg-final { scan-assembler "movt\tr4, 31" } } */
++/* { dg-final { scan-assembler "ands\tr2, r4" } } */
++/* { dg-final { scan-assembler "mov\tr4, ip" } } */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "movs\tr3, r4" } } */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-9.c
+@@ -0,0 +1,56 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++
++typedef struct
++{
++ char a:3;
++} test_st3;
++
++typedef struct
++{
++ char a:3;
++} test_st2;
++
++typedef struct
++{
++ test_st2 st2;
++ test_st3 st3;
++} test_st;
++
++typedef union
++{
++ test_st st;
++ struct
++ {
++ unsigned int v1;
++ unsigned int v2;
++ unsigned int v3;
++ unsigned int v4;
++ }values;
++} read_st;
++
++typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
++
++int
++main (void)
++{
++ read_st r;
++ foo_ns f;
++
++ f = (foo_ns) 0x200000;
++ r.values.v1 = 0xFFFFFFFF;
++
++ f (r.st);
++ return 0;
++}
++
++/* { dg-final { scan-assembler "mov\tip, r4" } } */
++/* { dg-final { scan-assembler "movw\tr4, #1799" } } */
++/* { dg-final { scan-assembler "ands\tr0, r4" } } */
++/* { dg-final { scan-assembler "mov\tr4, ip" } } */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "movs\tr1, r4" } } */
++/* { dg-final { scan-assembler "movs\tr2, r4" } } */
++/* { dg-final { scan-assembler "movs\tr3, r4" } } */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-and-union-1.c
+@@ -0,0 +1,96 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++
++typedef struct
++{
++ unsigned short a :11;
++} test_st_4;
++
++typedef union
++{
++ char a;
++ test_st_4 st4;
++}test_un_2;
++
++typedef struct
++{
++ unsigned char a;
++ unsigned int :0;
++ unsigned int b :1;
++ unsigned short :0;
++ unsigned short c;
++ unsigned int :0;
++ unsigned int d :21;
++} test_st_3;
++
++typedef struct
++{
++ unsigned char a :3;
++ unsigned int b :13;
++ test_un_2 un2;
++} test_st_2;
++
++typedef union
++{
++ test_st_2 st2;
++ test_st_3 st3;
++}test_un_1;
++
++typedef struct
++{
++ unsigned char a :2;
++ unsigned char :0;
++ unsigned short b :5;
++ unsigned char :0;
++ unsigned char c :4;
++ test_un_1 un1;
++} test_st_1;
++
++typedef union
++{
++ test_st_1 st1;
++ struct
++ {
++ unsigned int v1;
++ unsigned int v2;
++ unsigned int v3;
++ unsigned int v4;
++ }values;
++} read_st_1;
++
++
++typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st_1);
++
++int
++main (void)
++{
++ read_st_1 r;
++ foo_ns f;
++
++ f = (foo_ns) 0x200000;
++ r.values.v1 = 0xFFFFFFFF;
++ r.values.v2 = 0xFFFFFFFF;
++ r.values.v3 = 0xFFFFFFFF;
++ r.values.v4 = 0xFFFFFFFF;
++
++ f (r.st1);
++ return 0;
++}
++
++/* { dg-final { scan-assembler "mov\tip, r4" } } */
++/* { dg-final { scan-assembler "movw\tr4, #7939" } } */
++/* { dg-final { scan-assembler "movt\tr4, 15" } } */
++/* { dg-final { scan-assembler "ands\tr0, r4" } } */
++/* { dg-final { scan-assembler "movw\tr4, #65535" } } */
++/* { dg-final { scan-assembler "movt\tr4, 2047" } } */
++/* { dg-final { scan-assembler "ands\tr1, r4" } } */
++/* { dg-final { scan-assembler "movs\tr4, #1" } } */
++/* { dg-final { scan-assembler "movt\tr4, 65535" } } */
++/* { dg-final { scan-assembler "ands\tr2, r4" } } */
++/* { dg-final { scan-assembler "movw\tr4, #65535" } } */
++/* { dg-final { scan-assembler "movt\tr4, 31" } } */
++/* { dg-final { scan-assembler "ands\tr3, r4" } } */
++/* { dg-final { scan-assembler "mov\tr4, ip" } } */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-11.c
+@@ -0,0 +1,22 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_base_ok } */
++/* { dg-add-options arm_arch_v8m_base } */
++/* { dg-options "-mcmse" } */
++
++int __attribute__ ((cmse_nonsecure_call)) (*bar) (int);
++
++int
++foo (int a)
++{
++ return bar (bar (a + 1));
++}
++
++/* Checks for saving and clearing prior to function call. */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "movs\tr1, r4" } } */
++/* { dg-final { scan-assembler "movs\tr2, r4" } } */
++/* { dg-final { scan-assembler "movs\tr3, r4" } } */
++
++/* Now we check that we use the correct intrinsic to call. */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-13.c
+@@ -0,0 +1,25 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_base_ok } */
++/* { dg-add-options arm_arch_v8m_base } */
++/* { dg-options "-mcmse" } */
++
++int __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double);
++
++int
++foo (int a)
++{
++ return bar (1.0f, 2.0) + a + 1;
++}
++
++/* Checks for saving and clearing prior to function call. */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler-not "movs\tr0, r4" } } */
++/* { dg-final { scan-assembler "\n\tmovs\tr1, r4" } } */
++/* { dg-final { scan-assembler-not "\n\tmovs\tr2, r4\n\tmovs\tr3, r4" } } */
++/* { dg-final { scan-assembler-not "vmov" } } */
++/* { dg-final { scan-assembler-not "vmsr" } } */
++
++/* Now we check that we use the correct intrinsic to call. */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
++
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-2.c
+@@ -0,0 +1,19 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_base_ok } */
++/* { dg-add-options arm_arch_v8m_base } */
++/* { dg-options "-mcmse" } */
++
++extern float bar (void);
++
++float __attribute__ ((cmse_nonsecure_entry))
++foo (void)
++{
++ return bar ();
++}
++/* { dg-final { scan-assembler "movs\tr1, r0" } } */
++/* { dg-final { scan-assembler "movs\tr2, r0" } } */
++/* { dg-final { scan-assembler "movs\tr3, r0" } } */
++/* { dg-final { scan-assembler "mov\tip, r0" } } */
++/* { dg-final { scan-assembler "mov\tlr, r0" } } */
++/* { dg-final { scan-assembler "msr\tAPSR_nzcvq," } } */
++/* { dg-final { scan-assembler "bxns" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-6.c
+@@ -0,0 +1,21 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_base_ok } */
++/* { dg-add-options arm_arch_v8m_base } */
++/* { dg-options "-mcmse" } */
++
++int __attribute__ ((cmse_nonsecure_call)) (*bar) (double);
++
++int
++foo (int a)
++{
++ return bar (2.0) + a + 1;
++}
++
++/* Remember: don't clear r0 and r1, because the double parameter for bar is
++   passed in them.  */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "movs\tr2, r4" } } */
++
++/* Now we check that we use the correct intrinsic to call. */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/softfp.c
+@@ -0,0 +1,29 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_base_ok } */
++/* { dg-add-options arm_arch_v8m_base } */
++/* { dg-options "-mcmse -mfloat-abi=softfp" } */
++
++double __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double);
++
++double
++foo (double a)
++{
++ return bar (1.0f, 2.0) + a;
++}
++
++float __attribute__ ((cmse_nonsecure_entry))
++baz (float a, double b)
++{
++ return (float) bar (a, b);
++}
++
++/* Make sure we are not using FP instructions, since ARMv8-M Baseline does not
++ support such instructions. */
++/* { dg-final { scan-assembler-not "vmov" } } */
++/* { dg-final { scan-assembler-not "vmsr" } } */
++/* { dg-final { scan-assembler-not "vmrs" } } */
++
++/* Just double-checking that we are still doing CMSE though.  */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
++
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/union-1.c
+@@ -0,0 +1,71 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++
++typedef struct
++{
++ unsigned char a :2;
++ unsigned char :0;
++ unsigned short b :5;
++ unsigned char :0;
++ unsigned short c :3;
++ unsigned char :0;
++ unsigned int d :9;
++} test_st_1;
++
++typedef struct
++{
++ unsigned short a :7;
++ unsigned char :0;
++ unsigned char b :1;
++ unsigned char :0;
++ unsigned short c :6;
++} test_st_2;
++
++typedef union
++{
++ test_st_1 st_1;
++ test_st_2 st_2;
++}test_un;
++
++typedef union
++{
++ test_un un;
++ struct
++ {
++ unsigned int v1;
++ unsigned int v2;
++ unsigned int v3;
++ unsigned int v4;
++ }values;
++} read_un;
++
++
++typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_un);
++
++int
++main (void)
++{
++ read_un r;
++ foo_ns f;
++
++ f = (foo_ns) 0x200000;
++ r.values.v1 = 0xFFFFFFFF;
++ r.values.v2 = 0xFFFFFFFF;
++
++ f (r.un);
++ return 0;
++}
++
++/* { dg-final { scan-assembler "mov\tip, r4" } } */
++/* { dg-final { scan-assembler "movw\tr4, #8063" } } */
++/* { dg-final { scan-assembler "movt\tr4, 63" } } */
++/* { dg-final { scan-assembler "ands\tr0, r4" } } */
++/* { dg-final { scan-assembler "movw\tr4, #511" } } */
++/* { dg-final { scan-assembler "ands\tr1, r4" } } */
++/* { dg-final { scan-assembler "mov\tr4, ip" } } */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "movs\tr2, r4" } } */
++/* { dg-final { scan-assembler "movs\tr3, r4" } } */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
++
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/union-2.c
+@@ -0,0 +1,86 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++
++typedef struct
++{
++ unsigned char a :2;
++ unsigned char :0;
++ unsigned short b :5;
++ unsigned char :0;
++ unsigned short c :3;
++ unsigned char :0;
++ unsigned int d :9;
++} test_st_1;
++
++typedef struct
++{
++ unsigned short a :7;
++ unsigned char :0;
++ unsigned char b :1;
++ unsigned char :0;
++ unsigned short c :6;
++} test_st_2;
++
++typedef struct
++{
++ unsigned char a;
++ unsigned int :0;
++ unsigned int b :1;
++ unsigned short :0;
++ unsigned short c;
++ unsigned int :0;
++ unsigned int d :21;
++} test_st_3;
++
++typedef union
++{
++ test_st_1 st_1;
++ test_st_2 st_2;
++ test_st_3 st_3;
++}test_un;
++
++typedef union
++{
++ test_un un;
++ struct
++ {
++ unsigned int v1;
++ unsigned int v2;
++ unsigned int v3;
++ unsigned int v4;
++ }values;
++} read_un;
++
++
++typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_un);
++
++int
++main (void)
++{
++ read_un r;
++ foo_ns f;
++
++ f = (foo_ns) 0x200000;
++ r.values.v1 = 0xFFFFFFFF;
++ r.values.v2 = 0xFFFFFFFF;
++ r.values.v3 = 0xFFFFFFFF;
++
++ f (r.un);
++ return 0;
++}
++
++/* { dg-final { scan-assembler "mov\tip, r4" } } */
++/* { dg-final { scan-assembler "movw\tr4, #8191" } } */
++/* { dg-final { scan-assembler "movt\tr4, 63" } } */
++/* { dg-final { scan-assembler "ands\tr0, r4" } } */
++/* { dg-final { scan-assembler "movw\tr4, #511" } } */
++/* { dg-final { scan-assembler "movt\tr4, 65535" } } */
++/* { dg-final { scan-assembler "ands\tr1, r4" } } */
++/* { dg-final { scan-assembler "movw\tr4, #65535" } } */
++/* { dg-final { scan-assembler "movt\tr4, 31" } } */
++/* { dg-final { scan-assembler "ands\tr2, r4" } } */
++/* { dg-final { scan-assembler "mov\tr4, ip" } } */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "movs\tr3, r4" } } */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/bitfield-1.c
+@@ -0,0 +1,39 @@
++/* { dg-do run } */
++/* { dg-options "--save-temps -mcmse -Wl,--section-start,.gnu.sgstubs=0x20400000" } */
++
++typedef struct
++{
++ unsigned short a : 6;
++ unsigned char b : 3;
++ unsigned char c;
++ unsigned short d : 8;
++} test_st;
++
++test_st __attribute__ ((cmse_nonsecure_entry)) foo (void)
++{
++ test_st t;
++ t.a = 63u;
++ t.b = 7u;
++ t.c = 255u;
++ t.d = 255u;
++ return t;
++}
++
++int
++main (void)
++{
++ test_st t;
++ t = foo ();
++ if (t.a != 63u
++ || t.b != 7u
++ || t.c != 255u
++ || t.d != 255u)
++ __builtin_abort ();
++ return 0;
++}
++
++/* { dg-final { scan-assembler "movw\tr1, #1855" } } */
++/* { dg-final { scan-assembler "movt\tr1, 65535" } } */
++/* { dg-final { scan-assembler "ands\tr0(, r0)?, r1" } } */
++/* { dg-final { scan-assembler "bxns" } } */
++
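The constant in the scan-assembler lines above (movw #1855 plus movt #65535,
i.e. 0xffff073f) is exactly the set of bits the bit-fields of test_st occupy;
the entry-return sequence ANDs r0 with it so the padding bits cannot leak
secure state. A host-side sketch that reproduces the mask (it assumes the
ARM AAPCS bit-field layout, so the printed value may differ on other ABIs):

#include <stdio.h>
#include <string.h>

typedef struct
{
  unsigned short a : 6;
  unsigned char b : 3;
  unsigned char c;
  unsigned short d : 8;
} test_st;

int main (void)
{
  test_st t;
  unsigned int raw = 0;

  memset (&t, 0, sizeof t);          /* padding bits start out as zero   */
  t.a = 63u;                         /* all-ones in every field ...      */
  t.b = 7u;
  t.c = 255u;
  t.d = 255u;
  memcpy (&raw, &t, sizeof t < sizeof raw ? sizeof t : sizeof raw);
  printf ("mask = %#x\n", raw);      /* 0xffff073f under AAPCS rules     */
  return 0;
}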
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/bitfield-2.c
+@@ -0,0 +1,36 @@
++/* { dg-do run } */
++/* { dg-options "--save-temps -mcmse -Wl,--section-start,.gnu.sgstubs=0x20400000" } */
++
++typedef struct
++{
++ short a : 7;
++ signed char b : 3;
++ short c : 11;
++} test_st;
++
++test_st __attribute__ ((cmse_nonsecure_entry)) foo (void)
++{
++ test_st t;
++ t.a = -64;
++ t.b = -4;
++ t.c = -1024;
++ return t;
++}
++
++int
++main (void)
++{
++ test_st t;
++ t = foo ();
++ if (t.a != -64
++ || t.b != -4
++ || t.c != -1024)
++ __builtin_abort ();
++ return 0;
++}
++
++/* { dg-final { scan-assembler "movw\tr1, #1919" } } */
++/* { dg-final { scan-assembler "movt\tr1, 2047" } } */
++/* { dg-final { scan-assembler "ands\tr0(, r0)?, r1" } } */
++/* { dg-final { scan-assembler "bxns" } } */
++
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/bitfield-3.c
+@@ -0,0 +1,37 @@
++/* { dg-do run } */
++/* { dg-options "--save-temps -mcmse -Wl,--section-start,.gnu.sgstubs=0x20400000" } */
++
++typedef struct
++{
++ short a;
++ signed char b : 2;
++ short : 1;
++ signed char c : 3;
++} test_st;
++
++test_st __attribute__ ((cmse_nonsecure_entry)) foo (void)
++{
++ test_st t;
++ t.a = -32768;
++ t.b = -2;
++ t.c = -4;
++ return t;
++}
++
++int
++main (void)
++{
++ test_st t;
++ t = foo ();
++ if (t.a != -32768
++ || t.b != -2
++ || t.c != -4)
++ __builtin_abort ();
++ return 0;
++}
++
++/* { dg-final { scan-assembler "movw\tr1, #65535" } } */
++/* { dg-final { scan-assembler "movt\tr1, 63" } } */
++/* { dg-final { scan-assembler "ands\tr0(, r0)?, r1" } } */
++/* { dg-final { scan-assembler "bxns" } } */
++
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-1.c
+@@ -0,0 +1,106 @@
++/* { dg-do compile } */
++/* { dg-options "-Os -mcmse -fdump-rtl-expand" } */
++
++#include <arm_cmse.h>
++
++extern int a;
++extern int bar (void);
++
++int foo (char * p)
++{
++ cmse_address_info_t cait;
++
++ cait = cmse_TT (&a);
++ if (cait.flags.mpu_region)
++ a++;
++
++ cait = cmse_TT_fptr (&bar);
++ if (cait.flags.mpu_region)
++ a+= bar ();
++
++ cait = cmse_TTA (&a);
++ if (cait.flags.mpu_region)
++ a++;
++
++ cait = cmse_TTA_fptr (&bar);
++ if (cait.flags.mpu_region)
++ a+= bar ();
++
++ cait = cmse_TTT (&a);
++ if (cait.flags.mpu_region)
++ a++;
++
++ cait = cmse_TTT_fptr (&bar);
++ if (cait.flags.mpu_region)
++ a+= bar ();
++
++ cait = cmse_TTAT (&a);
++ if (cait.flags.mpu_region)
++ a++;
++
++ cait = cmse_TTAT_fptr (&bar);
++ if (cait.flags.mpu_region)
++ a+= bar ();
++
++ p = (char *) cmse_check_address_range ((void *) p, sizeof (char), 0);
++ p = (char *) cmse_check_address_range ((void *) p, sizeof (char),
++ CMSE_MPU_UNPRIV);
++ p = (char *) cmse_check_address_range ((void *) p, sizeof (char),
++ CMSE_MPU_READWRITE);
++ p = (char *) cmse_check_address_range ((void *) p, sizeof (char),
++ CMSE_MPU_UNPRIV | CMSE_MPU_READ);
++ p = (char *) cmse_check_address_range ((void *) p, sizeof (char),
++ CMSE_AU_NONSECURE
++ | CMSE_MPU_NONSECURE);
++ p = (char *) cmse_check_address_range ((void *) p, sizeof (char),
++ CMSE_NONSECURE | CMSE_MPU_UNPRIV);
++
++ p = (char *) cmse_check_pointed_object (p, CMSE_NONSECURE | CMSE_MPU_UNPRIV);
++
++ return a;
++}
++/* { dg-final { scan-assembler-times "\ttt " 2 } } */
++/* { dg-final { scan-assembler-times "ttt " 2 } } */
++/* { dg-final { scan-assembler-times "tta " 2 } } */
++/* { dg-final { scan-assembler-times "ttat " 2 } } */
++/* { dg-final { scan-assembler-times "bl.cmse_check_address_range" 7 } } */
++/* { dg-final { scan-assembler-not "cmse_check_pointed_object" } } */
++
++int __attribute__ ((cmse_nonsecure_entry))
++baz (void)
++{
++ return cmse_nonsecure_caller ();
++}
++
++typedef int __attribute__ ((cmse_nonsecure_call)) (int_nsfunc_t) (void);
++
++int default_callback (void)
++{
++ return 0;
++}
++
++int_nsfunc_t * fp = (int_nsfunc_t *) default_callback;
++
++void __attribute__ ((cmse_nonsecure_entry))
++qux (int_nsfunc_t * callback)
++{
++ fp = cmse_nsfptr_create (callback);
++}
++
++int call_callback (void)
++{
++ if (cmse_is_nsfptr (fp))
++ return fp ();
++ else
++ return default_callback ();
++}
++/* { dg-final { scan-assembler "baz:" } } */
++/* { dg-final { scan-assembler "__acle_se_baz:" } } */
++/* { dg-final { scan-assembler "qux:" } } */
++/* { dg-final { scan-assembler "__acle_se_qux:" } } */
++/* { dg-final { scan-assembler-not "\tcmse_nonsecure_caller" } } */
++/* { dg-final { scan-rtl-dump "and.*reg.*const_int 1" expand } } */
++/* { dg-final { scan-assembler "bic" } } */
++/* { dg-final { scan-assembler "push\t\{r4, r5, r6" } } */
++/* { dg-final { scan-assembler "msr\tAPSR_nzcvq" } } */
++/* { dg-final { scan-assembler-times "bl\\s+__gnu_cmse_nonsecure_call" 1 } } */
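cmse-1.c exercises the full intrinsic surface; in application code the most
commonly used of these is cmse_check_address_range, which vets a buffer the
nonsecure world handed in before secure code touches it. A minimal sketch
under the same assumptions (an ARMv8-M target built with -mcmse;
secure_copy_in and its parameters are hypothetical names, not part of the
test):

#include <arm_cmse.h>
#include <stddef.h>

/* Copy n bytes in from a nonsecure buffer, rejecting ranges that are not
   genuinely nonsecure-readable.  Returns 0 on success, -1 on rejection.  */
int secure_copy_in (char *dst, const char *ns_src, size_t n)
{
  const char *checked
    = (const char *) cmse_check_address_range ((void *) ns_src, n,
                                               CMSE_NONSECURE | CMSE_MPU_READ);
  if (checked == NULL)
    return -1;                 /* some byte of the range failed the check */
  for (size_t i = 0; i < n; i++)
    dst[i] = checked[i];
  return 0;
}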
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-10.c
+@@ -0,0 +1,9 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++
++void
++foo (void) {}
++
++/* { dg-final { scan-assembler-not "bxns" } } */
++/* { dg-final { scan-assembler "foo:" } } */
++/* { dg-final { scan-assembler-not "__acle_se_foo:" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-12.c
+@@ -0,0 +1,14 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++#include <arm_cmse.h>
++
++char *
++foo (char * p)
++{
++ if (!cmse_is_nsfptr (p))
++ return cmse_nsfptr_create (p);
++}
++
++/* Check that the intrinsics above are expanded inline rather than left as
++   calls. */
++/* { dg-final { scan-assembler-not "cmse_is_nsfptr" } } */
++/* { dg-final { scan-assembler-not "cmse_nsfptr_create" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-14.c
+@@ -0,0 +1,13 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++
++
++int __attribute__ ((cmse_nonsecure_call)) (*bar) (void);
++
++int foo (void)
++{
++ return bar ();
++}
++
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
++/* { dg-final { scan-assembler-not "b\[^ y\n\]*\\s+bar" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-15.c
+@@ -0,0 +1,72 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++
++int __attribute__ ((cmse_nonsecure_call)) (*ns_foo) (void);
++int (*s_bar) (void);
++int __attribute__ ((cmse_nonsecure_call)) (**ns_foo2) (void);
++int (**s_bar2) (void);
++
++typedef int __attribute__ ((cmse_nonsecure_call)) ns_foo_t (void);
++typedef int s_bar_t (void);
++typedef int __attribute__ ((cmse_nonsecure_call)) (* ns_foo_ptr) (void);
++typedef int (*s_bar_ptr) (void);
++
++int nonsecure0 (ns_foo_t * ns_foo_p)
++{
++ return ns_foo_p ();
++}
++
++int nonsecure1 (ns_foo_t ** ns_foo_p)
++{
++ return (*ns_foo_p) ();
++}
++
++int nonsecure2 (ns_foo_ptr ns_foo_p)
++{
++ return ns_foo_p ();
++}
++int nonsecure3 (ns_foo_ptr * ns_foo_p)
++{
++ return (*ns_foo_p) ();
++}
++
++int secure0 (s_bar_t * s_bar_p)
++{
++ return s_bar_p ();
++}
++
++int secure1 (s_bar_t ** s_bar_p)
++{
++ return (*s_bar_p) ();
++}
++
++int secure2 (s_bar_ptr s_bar_p)
++{
++ return s_bar_p ();
++}
++
++int secure3 (s_bar_ptr * s_bar_p)
++{
++ return (*s_bar_p) ();
++}
++
++int nonsecure4 (void)
++{
++ return ns_foo ();
++}
++
++int nonsecure5 (void)
++{
++ return (*ns_foo2) ();
++}
++
++int secure4 (void)
++{
++ return s_bar ();
++}
++
++int secure5 (void)
++{
++ return (*s_bar2) ();
++}
++/* { dg-final { scan-assembler-times "bl\\s+__gnu_cmse_nonsecure_call" 6 } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-3.c
+@@ -0,0 +1,45 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++
++struct span {
++ int a, b;
++};
++struct span2 {
++ float a, b, c, d;
++};
++
++union test_union
++{
++ long long a;
++ int b;
++ struct span2 c;
++} test_union;
++
++void __attribute__ ((cmse_nonsecure_entry))
++foo (long long a, int b, long long c) {} /* { dg-error "not available to functions with arguments passed on the stack" } */
++
++void __attribute__ ((cmse_nonsecure_entry))
++bar (long long a, int b, struct span c) {} /* { dg-error "not available to functions with arguments passed on the stack" } */
++
++void __attribute__ ((cmse_nonsecure_entry))
++baz (int a, ...) {} /* { dg-error "not available to functions with variable number of arguments" } */
++
++struct span __attribute__ ((cmse_nonsecure_entry))
++qux (void) { /* { dg-error "not available to functions that return value on the stack" } */
++ struct span ret = {0, 0};
++ return ret;
++}
++
++void __attribute__ ((cmse_nonsecure_entry))
++norf (struct span2 a) {}
++
++void __attribute__ ((cmse_nonsecure_entry))
++foo2 (long long a, int b, union test_union c) {} /* { dg-error "not available to functions with arguments passed on the stack" } */
++
++typedef void __attribute__ ((cmse_nonsecure_call)) bar2 (long long a, int b, long long c); /* { dg-error "not available to functions with arguments passed on the stack" } */
++
++typedef void __attribute__ ((cmse_nonsecure_call)) baz2 (long long a, int b, struct span c); /* { dg-error "not available to functions with arguments passed on the stack" } */
++
++typedef struct span __attribute__ ((cmse_nonsecure_call)) qux2 (void); /* { dg-error "not available to functions that return value on the stack" } */
++
++typedef void __attribute__ ((cmse_nonsecure_call)) norf2 (int a, ...); /* { dg-error "not available to functions with variable number of arguments" } */
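The flip side of these diagnostics: an entry function is accepted whenever
its arguments and return value fit in registers, since the nonsecure
caller's stack cannot be trusted. Under AAPCS that means at most four
word-sized arguments in r0-r3, as in this sketch (ok_entry is an
illustrative name, not part of the test):

int __attribute__ ((cmse_nonsecure_entry))
ok_entry (int a, int b, int c, int d)   /* exactly fills r0-r3 */
{
  return a + b + c + d;                 /* result fits in r0   */
}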
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-4.c
+@@ -0,0 +1,34 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++
++struct span {
++ int a, b;
++};
++
++extern int qux (void);
++
++void __attribute__ ((cmse_nonsecure_entry))
++foo (void) {}
++
++static void __attribute__ ((cmse_nonsecure_entry))
++bar (void) {} /* { dg-warning "has no effect on functions with static linkage" } */
++
++int __attribute__ ((cmse_nonsecure_entry))
++baz (void)
++{
++ return qux ();
++}
++
++void __attribute__ ((cmse_nonsecure_call))
++quux (void) {} /* { dg-warning "attribute only applies to base type of a function pointer" } */
++
++int __attribute__ ((cmse_nonsecure_call)) norf; /* { dg-warning "attribute only applies to base type of a function pointer" } */
++
++/* { dg-final { scan-assembler-times "bxns" 2 } } */
++/* { dg-final { scan-assembler "foo:" } } */
++/* { dg-final { scan-assembler "__acle_se_foo:" } } */
++/* { dg-final { scan-assembler-not "__acle_se_bar:" } } */
++/* { dg-final { scan-assembler "baz:" } } */
++/* { dg-final { scan-assembler "__acle_se_baz:" } } */
++/* { dg-final { scan-assembler-not "__acle_se_quux:" } } */
++/* { dg-final { scan-assembler-not "__acle_se_norf:" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-9.c
+@@ -0,0 +1,20 @@
++/* { dg-do compile } */
++/* { dg-skip-if "Testing exclusion of -mcmse" { arm-*-* } { "-mcmse" } { "" } } */
++
++
++void __attribute__ ((cmse_nonsecure_call)) (*bar) (int); /* { dg-warning "attribute ignored without -mcmse option" } */
++typedef void __attribute__ ((cmse_nonsecure_call)) baz (int); /* { dg-warning "attribute ignored without -mcmse option" } */
++
++int __attribute__ ((cmse_nonsecure_entry))
++foo (int a, baz b)
++{ /* { dg-warning "attribute ignored without -mcmse option" } */
++ bar (a);
++ b (a);
++ return a + 1;
++}
++
++/* { dg-final { scan-assembler-not "bxns" } } */
++/* { dg-final { scan-assembler-not "blxns" } } */
++/* { dg-final { scan-assembler-not "bl\t__gnu_cmse_nonsecure_call" } } */
++/* { dg-final { scan-assembler "foo:" } } */
++/* { dg-final { scan-assembler-not "__acle_se_foo:" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse.exp
+@@ -0,0 +1,72 @@
++# Copyright (C) 1997-2016 Free Software Foundation, Inc.
++
++# This program is free software; you can redistribute it and/or modify
++# it under the terms of the GNU General Public License as published by
++# the Free Software Foundation; either version 3 of the License, or
++# (at your option) any later version.
++#
++# This program is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY; without even the implied warranty of
++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++# GNU General Public License for more details.
++#
++# You should have received a copy of the GNU General Public License
++# along with GCC; see the file COPYING3. If not see
++# <http://www.gnu.org/licenses/>.
++
++# GCC testsuite for ARMv8-M Security Extensions using the `dg.exp' driver.
++
++# Load support procs.
++load_lib gcc-dg.exp
++
++# Exit immediately if the target does not support -mcmse.
++if ![check_effective_target_arm_cmse_ok] then {
++ return
++}
++
++# If a testcase doesn't have special options, use these.
++global DEFAULT_CFLAGS
++if ![info exists DEFAULT_CFLAGS] then {
++ set DEFAULT_CFLAGS " -ansi -pedantic-errors"
++}
++
++# Initialize `dg'.
++dg-init
++
++set saved-dg-do-what-default ${dg-do-what-default}
++set dg-do-what-default "assemble"
++
++set saved-lto_torture_options ${LTO_TORTURE_OPTIONS}
++set LTO_TORTURE_OPTIONS ""
++
++# These are for both baseline and mainline.
++gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] \
++ "" $DEFAULT_CFLAGS
++
++if {[check_effective_target_arm_arch_v8m_base_ok]} then {
++ # Baseline only
++ gcc-dg-runtest [lsort [glob $srcdir/$subdir/baseline/*.c]] \
++ "" $DEFAULT_CFLAGS
++}
++
++if {[check_effective_target_arm_arch_v8m_main_ok]} then {
++ gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/*.c]] \
++ "" $DEFAULT_CFLAGS
++ # Mainline -mfloat-abi=soft
++ gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/soft/*.c]] \
++ "-mfloat-abi=soft" $DEFAULT_CFLAGS
++ gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/softfp/*.c]] \
++ "" $DEFAULT_CFLAGS
++ gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/softfp-sp/*.c]] \
++ "" $DEFAULT_CFLAGS
++ gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/hard/*.c]] \
++ "" $DEFAULT_CFLAGS
++ gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/hard-sp/*.c]] \
++ "" $DEFAULT_CFLAGS
++}
++
++set LTO_TORTURE_OPTIONS ${saved-lto_torture_options}
++set dg-do-what-default ${saved-dg-do-what-default}
++
++# All done.
++dg-finish
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-4.c
+@@ -0,0 +1,55 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++
++typedef struct
++{
++ unsigned char a;
++ unsigned int b:5;
++ unsigned int c:11, :0, d:8;
++ struct { unsigned int ee:2; } e;
++} test_st;
++
++typedef union
++{
++ test_st st;
++ struct
++ {
++ unsigned int v1;
++ unsigned int v2;
++ unsigned int v3;
++ unsigned int v4;
++ }values;
++} read_st;
++
++
++typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
++
++extern void foo (test_st st);
++
++int
++main (void)
++{
++ read_st r;
++ foo_ns f;
++
++ f = (foo_ns) 0x200000;
++ r.values.v1 = 0xFFFFFFFF;
++ r.values.v2 = 0xFFFFFFFF;
++ r.values.v3 = 0xFFFFFFFF;
++ r.values.v4 = 0xFFFFFFFF;
++
++ f (r.st);
++ return 0;
++}
++
++/* { dg-final { scan-assembler "movw\tip, #65535" } } */
++/* { dg-final { scan-assembler "movt\tip, 255" } } */
++/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */
++/* { dg-final { scan-assembler "mov\tip, #255" } } */
++/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */
++/* { dg-final { scan-assembler "mov\tip, #3" } } */
++/* { dg-final { scan-assembler "and\tr2, r2, ip" } } */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "mov\tr3, r4" } } */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-5.c
+@@ -0,0 +1,51 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++
++typedef struct
++{
++ unsigned char a;
++ unsigned short b :5;
++ unsigned char c;
++ unsigned short d :11;
++} test_st;
++
++typedef union
++{
++ test_st st;
++ struct
++ {
++ unsigned int v1;
++ unsigned int v2;
++ unsigned int v3;
++ unsigned int v4;
++ }values;
++} read_st;
++
++
++typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
++
++int
++main (void)
++{
++ read_st r;
++ foo_ns f;
++
++ f = (foo_ns) 0x200000;
++ r.values.v1 = 0xFFFFFFFF;
++ r.values.v2 = 0xFFFFFFFF;
++
++ f (r.st);
++ return 0;
++}
++
++/* { dg-final { scan-assembler "movw\tip, #8191" } } */
++/* { dg-final { scan-assembler "movt\tip, 255" } } */
++/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */
++/* { dg-final { scan-assembler "movw\tip, #2047" } } */
++/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "mov\tr2, r4" } } */
++/* { dg-final { scan-assembler "mov\tr3, r4" } } */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
++
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-6.c
+@@ -0,0 +1,61 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++
++typedef struct
++{
++ unsigned char a;
++ unsigned int b : 3;
++ unsigned int c : 14;
++ unsigned int d : 1;
++ struct {
++ unsigned int ee : 2;
++ unsigned short ff : 15;
++ } e;
++ unsigned char g : 1;
++ unsigned char : 4;
++ unsigned char h : 3;
++} test_st;
++
++typedef union
++{
++ test_st st;
++ struct
++ {
++ unsigned int v1;
++ unsigned int v2;
++ unsigned int v3;
++ unsigned int v4;
++ }values;
++} read_st;
++
++
++typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
++
++int
++main (void)
++{
++ read_st r;
++ foo_ns f;
++
++ f = (foo_ns) 0x200000;
++ r.values.v1 = 0xFFFFFFFF;
++ r.values.v2 = 0xFFFFFFFF;
++ r.values.v3 = 0xFFFFFFFF;
++ r.values.v4 = 0xFFFFFFFF;
++
++ f (r.st);
++ return 0;
++}
++
++/* { dg-final { scan-assembler "movw\tip, #65535" } } */
++/* { dg-final { scan-assembler "movt\tip, 1023" } } */
++/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */
++/* { dg-final { scan-assembler "mov\tip, #3" } } */
++/* { dg-final { scan-assembler "movt\tip, 32767" } } */
++/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */
++/* { dg-final { scan-assembler "mov\tip, #255" } } */
++/* { dg-final { scan-assembler "and\tr2, r2, ip" } } */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "mov\tr3, r4" } } */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-7.c
+@@ -0,0 +1,52 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++
++typedef struct
++{
++ unsigned char a;
++ unsigned short b :5;
++ unsigned char c;
++ unsigned short d :11;
++} test_st;
++
++typedef union
++{
++ test_st st;
++ struct
++ {
++ unsigned int v1;
++ unsigned int v2;
++ unsigned int v3;
++ unsigned int v4;
++ }values;
++} read_st;
++
++
++typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
++
++int
++main (void)
++{
++ read_st r;
++ foo_ns f;
++
++ f = (foo_ns) 0x200000;
++ r.values.v1 = 0xFFFFFFFF;
++ r.values.v2 = 0xFFFFFFFF;
++
++ f (r.st);
++ return 0;
++}
++
++
++/* { dg-final { scan-assembler "movw\tip, #8191" } } */
++/* { dg-final { scan-assembler "movt\tip, 255" } } */
++/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */
++/* { dg-final { scan-assembler "movw\tip, #2047" } } */
++/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "mov\tr2, r4" } } */
++/* { dg-final { scan-assembler "mov\tr3, r4" } } */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
++
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-8.c
+@@ -0,0 +1,55 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++
++typedef struct
++{
++ unsigned char a;
++ unsigned int :0;
++ unsigned int b :1;
++ unsigned short :0;
++ unsigned short c;
++ unsigned int :0;
++ unsigned int d :21;
++} test_st;
++
++typedef union
++{
++ test_st st;
++ struct
++ {
++ unsigned int v1;
++ unsigned int v2;
++ unsigned int v3;
++ unsigned int v4;
++ }values;
++} read_st;
++
++typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
++
++int
++main (void)
++{
++ read_st r;
++ foo_ns f;
++
++ f = (foo_ns) 0x200000;
++ r.values.v1 = 0xFFFFFFFF;
++ r.values.v2 = 0xFFFFFFFF;
++ r.values.v3 = 0xFFFFFFFF;
++
++ f (r.st);
++ return 0;
++}
++
++/* { dg-final { scan-assembler "mov\tip, #255" } } */
++/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */
++/* { dg-final { scan-assembler "mov\tip, #1" } } */
++/* { dg-final { scan-assembler "movt\tip, 65535" } } */
++/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */
++/* { dg-final { scan-assembler "movw\tip, #65535" } } */
++/* { dg-final { scan-assembler "movt\tip, 31" } } */
++/* { dg-final { scan-assembler "and\tr2, r2, ip" } } */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "mov\tr3, r4" } } */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-9.c
+@@ -0,0 +1,54 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++
++typedef struct
++{
++ char a:3;
++} test_st3;
++
++typedef struct
++{
++ char a:3;
++} test_st2;
++
++typedef struct
++{
++ test_st2 st2;
++ test_st3 st3;
++} test_st;
++
++typedef union
++{
++ test_st st;
++ struct
++ {
++ unsigned int v1;
++ unsigned int v2;
++ unsigned int v3;
++ unsigned int v4;
++ }values;
++} read_st;
++
++typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
++
++int
++main (void)
++{
++ read_st r;
++ foo_ns f;
++
++ f = (foo_ns) 0x200000;
++ r.values.v1 = 0xFFFFFFFF;
++
++ f (r.st);
++ return 0;
++}
++
++/* { dg-final { scan-assembler "movw\tip, #1799" } } */
++/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "mov\tr1, r4" } } */
++/* { dg-final { scan-assembler "mov\tr2, r4" } } */
++/* { dg-final { scan-assembler "mov\tr3, r4" } } */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-and-union-1.c
+@@ -0,0 +1,94 @@
++/* { dg-do compile } */
++/* { dg-options "-mcmse" } */
++
++typedef struct
++{
++ unsigned short a :11;
++} test_st_4;
++
++typedef union
++{
++ char a;
++ test_st_4 st4;
++}test_un_2;
++
++typedef struct
++{
++ unsigned char a;
++ unsigned int :0;
++ unsigned int b :1;
++ unsigned short :0;
++ unsigned short c;
++ unsigned int :0;
++ unsigned int d :21;
++} test_st_3;
++
++typedef struct
++{
++ unsigned char a :3;
++ unsigned int b :13;
++ test_un_2 un2;
++} test_st_2;
++
++typedef union
++{
++ test_st_2 st2;
++ test_st_3 st3;
++}test_un_1;
++
++typedef struct
++{
++ unsigned char a :2;
++ unsigned char :0;
++ unsigned short b :5;
++ unsigned char :0;
++ unsigned char c :4;
++ test_un_1 un1;
++} test_st_1;
++
++typedef union
++{
++ test_st_1 st1;
++ struct
++ {
++ unsigned int v1;
++ unsigned int v2;
++ unsigned int v3;
++ unsigned int v4;
++ }values;
++} read_st_1;
++
++
++typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st_1);
++
++int
++main (void)
++{
++ read_st_1 r;
++ foo_ns f;
++
++ f = (foo_ns) 0x200000;
++ r.values.v1 = 0xFFFFFFFF;
++ r.values.v2 = 0xFFFFFFFF;
++ r.values.v3 = 0xFFFFFFFF;
++ r.values.v4 = 0xFFFFFFFF;
++
++ f (r.st1);
++ return 0;
++}
++
++/* { dg-final { scan-assembler "movw\tip, #7939" } } */
++/* { dg-final { scan-assembler "movt\tip, 15" } } */
++/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */
++/* { dg-final { scan-assembler "movw\tip, #65535" } } */
++/* { dg-final { scan-assembler "movt\tip, 2047" } } */
++/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */
++/* { dg-final { scan-assembler "mov\tip, #1" } } */
++/* { dg-final { scan-assembler "movt\tip, 65535" } } */
++/* { dg-final { scan-assembler "and\tr2, r2, ip" } } */
++/* { dg-final { scan-assembler "movw\tip, #65535" } } */
++/* { dg-final { scan-assembler "movt\tip, 31" } } */
++/* { dg-final { scan-assembler "and\tr3, r3, ip" } } */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-13.c
+@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-options "-O2" } */
+/* { dg-add-options arm_arch_v8m_main } */
++/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */
++/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */
++/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-sp-d16" } */
+
-+#include "../aarch64/atomic-op-consume.x"
+
-+/* Scan for ldaex is a PR59448 consume workaround. */
-+/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- a/src/gcc/testsuite/gcc.target/arm/atomic-op-consume.c
-+++ b/src//dev/null
-@@ -1,11 +0,0 @@
--/* { dg-do compile } */
--/* { dg-require-effective-target arm_arch_v8a_ok } */
--/* { dg-options "-O2" } */
--/* { dg-add-options arm_arch_v8a } */
--
--#include "../aarch64/atomic-op-consume.x"
--
--/* Scan for ldaex is a PR59448 consume workaround. */
--/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-int-1.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8a_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8a } */
++int __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double);
+
-+#include "../aarch64/atomic-op-int.x"
++int
++foo (int a)
++{
++ return bar (3.0f, 2.0) + a + 1;
++}
++
++/* Checks for saving and clearing prior to function call. */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "mov\tr0, r4" } } */
++/* { dg-final { scan-assembler "mov\tr1, r4" } } */
++/* { dg-final { scan-assembler "mov\tr2, r4" } } */
++/* { dg-final { scan-assembler "mov\tr3, r4" } } */
++/* { dg-final { scan-assembler-not "vldr\.32\ts0, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts1, .L" } } */
++/* { dg-final { scan-assembler-not "vldr\.32\ts2, .L" } } */
++/* { dg-final { scan-assembler-not "vldr\.32\ts3, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts4, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts5, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts6, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts7, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts8, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts9, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts10, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts11, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts12, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts13, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts14, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts15, .L" } } */
++
++/* Now we check that we use the correct intrinsic to call. */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+
-+/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
--- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-int-2.c
-@@ -0,0 +1,10 @@
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-5.c
+@@ -0,0 +1,45 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-options "-O2" } */
+/* { dg-add-options arm_arch_v8m_main } */
-+
-+#include "../aarch64/atomic-op-int.x"
-+
-+/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- a/src/gcc/testsuite/gcc.target/arm/atomic-op-int.c
-+++ b/src//dev/null
-@@ -1,10 +0,0 @@
--/* { dg-do compile } */
--/* { dg-require-effective-target arm_arch_v8a_ok } */
--/* { dg-options "-O2" } */
--/* { dg-add-options arm_arch_v8a } */
--
--#include "../aarch64/atomic-op-int.x"
--
--/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-not "dmb" } } */
++/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */
++/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */
++/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-sp-d16" } */
++
++extern float bar (void);
++
++float __attribute__ ((cmse_nonsecure_entry))
++foo (void)
++{
++ return bar ();
++}
++/* { dg-final { scan-assembler "mov\tr0, lr" } } */
++/* { dg-final { scan-assembler "mov\tr1, lr" } } */
++/* { dg-final { scan-assembler "mov\tr2, lr" } } */
++/* { dg-final { scan-assembler "mov\tr3, lr" } } */
++/* { dg-final { scan-assembler-not "vmov\.f32\ts0, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts1, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts2, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts3, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts4, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts5, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts6, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts7, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts8, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts9, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts10, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts11, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts12, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts13, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts14, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts15, #1\.0" } } */
++/* { dg-final { scan-assembler "msr\tAPSR_nzcvq, lr" { target { arm_arch_v8m_main_ok && { ! arm_dsp } } } } } */
++/* { dg-final { scan-assembler "msr\tAPSR_nzcvqg, lr" { target { arm_arch_v8m_main_ok && arm_dsp } } } } */
++/* { dg-final { scan-assembler "push\t{r4}" } } */
++/* { dg-final { scan-assembler "vmrs\tip, fpscr" } } */
++/* { dg-final { scan-assembler "movw\tr4, #65376" } } */
++/* { dg-final { scan-assembler "movt\tr4, #4095" } } */
++/* { dg-final { scan-assembler "and\tip, r4" } } */
++/* { dg-final { scan-assembler "vmsr\tfpscr, ip" } } */
++/* { dg-final { scan-assembler "pop\t{r4}" } } */
++/* { dg-final { scan-assembler "mov\tip, lr" } } */
++/* { dg-final { scan-assembler "bxns" } } */
--- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-relaxed-1.c
-@@ -0,0 +1,10 @@
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-7.c
+@@ -0,0 +1,42 @@
+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8a_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8a } */
++/* { dg-require-effective-target arm_arch_v8m_main_ok } */
++/* { dg-add-options arm_arch_v8m_main } */
++/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */
++/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */
++/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-sp-d16" } */
+
-+#include "../aarch64/atomic-op-relaxed.x"
++int __attribute__ ((cmse_nonsecure_call)) (*bar) (void);
++
++int
++foo (int a)
++{
++ return bar () + a + 1;
++}
++
++/* Checks for saving and clearing prior to function call. */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "mov\tr0, r4" } } */
++/* { dg-final { scan-assembler "mov\tr1, r4" } } */
++/* { dg-final { scan-assembler "mov\tr2, r4" } } */
++/* { dg-final { scan-assembler "mov\tr3, r4" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts0, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts1, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts2, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts3, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts4, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts5, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts6, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts7, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts8, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts9, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts10, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts11, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts12, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts13, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts14, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts15, .L" } } */
++
++/* Now we check that we use the correct intrinsic to call. */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+
-+/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
--- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-relaxed-2.c
-@@ -0,0 +1,10 @@
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-8.c
+@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-options "-O2" } */
+/* { dg-add-options arm_arch_v8m_main } */
++/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */
++/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */
++/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-sp-d16" } */
+
-+#include "../aarch64/atomic-op-relaxed.x"
++int __attribute__ ((cmse_nonsecure_call)) (*bar) (double);
+
-+/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- a/src/gcc/testsuite/gcc.target/arm/atomic-op-relaxed.c
-+++ b/src//dev/null
-@@ -1,10 +0,0 @@
--/* { dg-do compile } */
--/* { dg-require-effective-target arm_arch_v8a_ok } */
--/* { dg-options "-O2" } */
--/* { dg-add-options arm_arch_v8a } */
--
--#include "../aarch64/atomic-op-relaxed.x"
--
--/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-not "dmb" } } */
++int
++foo (int a)
++{
++ return bar (2.0) + a + 1;
++}
++
++/* Checks for saving and clearing prior to function call. */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "mov\tr0, r4" } } */
++/* { dg-final { scan-assembler "mov\tr1, r4" } } */
++/* { dg-final { scan-assembler "mov\tr2, r4" } } */
++/* { dg-final { scan-assembler "mov\tr3, r4" } } */
++/* { dg-final { scan-assembler-not "vldr\.32\ts0, .L" } } */
++/* { dg-final { scan-assembler-not "vldr\.32\ts1, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts2, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts3, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts4, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts5, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts6, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts7, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts8, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts9, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts10, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts11, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts12, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts13, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts14, .L" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts15, .L" } } */
++
++/* Now we check that we use the correct intrinsic to call. */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
--- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-release-1.c
-@@ -0,0 +1,10 @@
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-13.c
+@@ -0,0 +1,38 @@
+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8a_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8a } */
++/* { dg-require-effective-target arm_arch_v8m_main_ok } */
++/* { dg-add-options arm_arch_v8m_main } */
++/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */
++/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */
++/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-d16" } */
+
-+#include "../aarch64/atomic-op-release.x"
+
-+/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
++int __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double);
++
++int
++foo (int a)
++{
++ return bar (3.0f, 2.0) + a + 1;
++}
++
++/* Checks for saving and clearing prior to function call. */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "mov\tr0, r4" } } */
++/* { dg-final { scan-assembler "mov\tr1, r4" } } */
++/* { dg-final { scan-assembler "mov\tr2, r4" } } */
++/* { dg-final { scan-assembler "mov\tr3, r4" } } */
++/* { dg-final { scan-assembler "vldr\.32\ts1, .L" } } */
++/* { dg-final { scan-assembler-not "vldr\.64\td0, .L" } } */
++/* { dg-final { scan-assembler-not "vldr\.32\ts0, .L" } } */
++/* { dg-final { scan-assembler-not "vldr\.64\td1, .L" } } */
++/* { dg-final { scan-assembler-not "vldr\.32\ts2, .L" } } */
++/* { dg-final { scan-assembler-not "vldr\.32\ts3, .L" } } */
++/* { dg-final { scan-assembler "vldr\.64\td2, .L" } } */
++/* { dg-final { scan-assembler "vldr\.64\td3, .L" } } */
++/* { dg-final { scan-assembler "vldr\.64\td4, .L" } } */
++/* { dg-final { scan-assembler "vldr\.64\td5, .L" } } */
++/* { dg-final { scan-assembler "vldr\.64\td6, .L" } } */
++/* { dg-final { scan-assembler "vldr\.64\td7, .L" } } */
++
++/* Now we check that we use the correct intrinsic to call. */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
--- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-release-2.c
-@@ -0,0 +1,10 @@
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-5.c
+@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-options "-O2" } */
+/* { dg-add-options arm_arch_v8m_main } */
-+
-+#include "../aarch64/atomic-op-release.x"
-+
-+/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- a/src/gcc/testsuite/gcc.target/arm/atomic-op-release.c
-+++ b/src//dev/null
-@@ -1,10 +0,0 @@
--/* { dg-do compile } */
--/* { dg-require-effective-target arm_arch_v8a_ok } */
--/* { dg-options "-O2" } */
--/* { dg-add-options arm_arch_v8a } */
--
--#include "../aarch64/atomic-op-release.x"
--
--/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-not "dmb" } } */
++/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */
++/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */
++/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-d16" } */
++
++extern float bar (void);
++
++float __attribute__ ((cmse_nonsecure_entry))
++foo (void)
++{
++ return bar ();
++}
++/* { dg-final { scan-assembler "mov\tr0, lr" } } */
++/* { dg-final { scan-assembler "mov\tr1, lr" } } */
++/* { dg-final { scan-assembler "mov\tr2, lr" } } */
++/* { dg-final { scan-assembler "mov\tr3, lr" } } */
++/* { dg-final { scan-assembler-not "vmov\.f32\ts0, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts1, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f64\td1, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f64\td2, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f64\td3, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f64\td4, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f64\td5, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f64\td6, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f64\td7, #1\.0" } } */
++/* { dg-final { scan-assembler "msr\tAPSR_nzcvq, lr" { target { arm_arch_v8m_main_ok && { ! arm_dsp } } } } } */
++/* { dg-final { scan-assembler "msr\tAPSR_nzcvqg, lr" { target { arm_arch_v8m_main_ok && arm_dsp } } } } */
++/* { dg-final { scan-assembler "push\t{r4}" } } */
++/* { dg-final { scan-assembler "vmrs\tip, fpscr" } } */
++/* { dg-final { scan-assembler "movw\tr4, #65376" } } */
++/* { dg-final { scan-assembler "movt\tr4, #4095" } } */
++/* { dg-final { scan-assembler "and\tip, r4" } } */
++/* { dg-final { scan-assembler "vmsr\tfpscr, ip" } } */
++/* { dg-final { scan-assembler "pop\t{r4}" } } */
++/* { dg-final { scan-assembler "mov\tip, lr" } } */
++/* { dg-final { scan-assembler "bxns" } } */
--- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-seq_cst-1.c
-@@ -0,0 +1,10 @@
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-7.c
+@@ -0,0 +1,34 @@
+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8a_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8a } */
++/* { dg-require-effective-target arm_arch_v8m_main_ok } */
++/* { dg-add-options arm_arch_v8m_main } */
++/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */
++/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */
++/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-d16" } */
+
-+#include "../aarch64/atomic-op-seq_cst.x"
++int __attribute__ ((cmse_nonsecure_call)) (*bar) (void);
++
++int
++foo (int a)
++{
++ return bar () + a + 1;
++}
++
++/* Checks for saving and clearing prior to function call. */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "mov\tr0, r4" } } */
++/* { dg-final { scan-assembler "mov\tr1, r4" } } */
++/* { dg-final { scan-assembler "mov\tr2, r4" } } */
++/* { dg-final { scan-assembler "mov\tr3, r4" } } */
++/* { dg-final { scan-assembler "vldr\.64\td0, .L" } } */
++/* { dg-final { scan-assembler "vldr\.64\td1, .L" } } */
++/* { dg-final { scan-assembler "vldr\.64\td2, .L" } } */
++/* { dg-final { scan-assembler "vldr\.64\td3, .L" } } */
++/* { dg-final { scan-assembler "vldr\.64\td4, .L" } } */
++/* { dg-final { scan-assembler "vldr\.64\td5, .L" } } */
++/* { dg-final { scan-assembler "vldr\.64\td6, .L" } } */
++/* { dg-final { scan-assembler "vldr\.64\td7, .L" } } */
++
++/* Now we check that we use the correct intrinsic to call. */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+
-+/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
--- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-seq_cst-2.c
-@@ -0,0 +1,10 @@
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-8.c
+@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-options "-O2" } */
+/* { dg-add-options arm_arch_v8m_main } */
++/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */
++/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */
++/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-d16" } */
+
-+#include "../aarch64/atomic-op-seq_cst.x"
++int __attribute__ ((cmse_nonsecure_call)) (*bar) (double);
+
-+/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- a/src/gcc/testsuite/gcc.target/arm/atomic-op-seq_cst.c
-+++ b/src//dev/null
-@@ -1,10 +0,0 @@
--/* { dg-do compile } */
--/* { dg-require-effective-target arm_arch_v8a_ok } */
--/* { dg-options "-O2" } */
--/* { dg-add-options arm_arch_v8a } */
--
--#include "../aarch64/atomic-op-seq_cst.x"
--
--/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-not "dmb" } } */
++int
++foo (int a)
++{
++ return bar (2.0) + a + 1;
++}
++
++/* Checks for saving and clearing prior to function call. */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "mov\tr0, r4" } } */
++/* { dg-final { scan-assembler "mov\tr1, r4" } } */
++/* { dg-final { scan-assembler "mov\tr2, r4" } } */
++/* { dg-final { scan-assembler "mov\tr3, r4" } } */
++/* { dg-final { scan-assembler-not "vldr\.64\td0, .L" } } */
++/* { dg-final { scan-assembler "vldr\.64\td1, .L" } } */
++/* { dg-final { scan-assembler "vldr\.64\td2, .L" } } */
++/* { dg-final { scan-assembler "vldr\.64\td3, .L" } } */
++/* { dg-final { scan-assembler "vldr\.64\td4, .L" } } */
++/* { dg-final { scan-assembler "vldr\.64\td5, .L" } } */
++/* { dg-final { scan-assembler "vldr\.64\td6, .L" } } */
++/* { dg-final { scan-assembler "vldr\.64\td7, .L" } } */
++
++/* Now we check that we use the correct intrinsic to call. */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
--- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-short-1.c
-@@ -0,0 +1,10 @@
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-13.c
+@@ -0,0 +1,27 @@
+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8a_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8a } */
++/* { dg-require-effective-target arm_arch_v8m_main_ok } */
++/* { dg-add-options arm_arch_v8m_main } */
++/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=hard" -mfloat-abi=softfp } {""} } */
++/* { dg-options "-mcmse -mfloat-abi=soft" } */
+
-+#include "../aarch64/atomic-op-short.x"
++int __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double);
++
++int
++foo (int a)
++{
++ return bar (1.0f, 2.0) + a + 1;
++}
++
++/* Checks for saving and clearing prior to function call. */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */
++/* { dg-final { scan-assembler "mov\tr1, r4" } } */
++/* { dg-final { scan-assembler-not "mov\tr2, r4" } } */
++/* { dg-final { scan-assembler-not "mov\tr3, r4" } } */
++/* { dg-final { scan-assembler-not "vmov" } } */
++/* { dg-final { scan-assembler-not "vmsr" } } */
++
++/* Now we check that we use the correct intrinsic to call. */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+
-+/* { dg-final { scan-assembler-times "ldrexh\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strexh\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
--- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-short-2.c
-@@ -0,0 +1,10 @@
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-5.c
+@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-options "-O2" } */
+/* { dg-add-options arm_arch_v8m_main } */
++/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=hard" -mfloat-abi=softfp } {""} } */
++/* { dg-options "-mcmse -mfloat-abi=soft" } */
+
-+#include "../aarch64/atomic-op-short.x"
++extern float bar (void);
+
-+/* { dg-final { scan-assembler-times "ldrexh\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strexh\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- a/src/gcc/testsuite/gcc.target/arm/atomic-op-short.c
-+++ b/src//dev/null
-@@ -1,10 +0,0 @@
--/* { dg-do compile } */
--/* { dg-require-effective-target arm_arch_v8a_ok } */
--/* { dg-options "-O2" } */
--/* { dg-add-options arm_arch_v8a } */
--
--#include "../aarch64/atomic-op-short.x"
--
--/* { dg-final { scan-assembler-times "ldrexh\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-times "strexh\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/attr-fp16-arith-1.c
-@@ -0,0 +1,58 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
++float __attribute__ ((cmse_nonsecure_entry))
++foo (void)
++{
++ return bar ();
++}
+
-+/* Reset fpu to a value compatible with the next pragmas. */
-+#pragma GCC target ("fpu=vfp")
++/* { dg-final { scan-assembler "mov\tr1, lr" } } */
++/* { dg-final { scan-assembler "mov\tr2, lr" } } */
++/* { dg-final { scan-assembler "mov\tr3, lr" } } */
++/* { dg-final { scan-assembler "mov\tip, lr" } } */
++/* { dg-final { scan-assembler-not "vmov" } } */
++/* { dg-final { scan-assembler-not "vmsr" } } */
++/* { dg-final { scan-assembler "msr\tAPSR_nzcvq, lr" { target { arm_arch_v8m_main_ok && { ! arm_dsp } } } } } */
++/* { dg-final { scan-assembler "msr\tAPSR_nzcvqg, lr" { target { arm_arch_v8m_main_ok && arm_dsp } } } } */
++/* { dg-final { scan-assembler "bxns" } } */
+
-+#pragma GCC push_options
-+#pragma GCC target ("fpu=fp-armv8")
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-7.c
+@@ -0,0 +1,27 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_main_ok } */
++/* { dg-add-options arm_arch_v8m_main } */
++/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=hard" -mfloat-abi=softfp } {""} } */
++/* { dg-options "-mcmse -mfloat-abi=soft" } */
+
-+#ifndef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
-+#error __ARM_FEATURE_FP16_SCALAR_ARITHMETIC not defined.
-+#endif
++int __attribute__ ((cmse_nonsecure_call)) (*bar) (void);
+
-+#pragma GCC push_options
-+#pragma GCC target ("fpu=neon-fp-armv8")
++int
++foo (int a)
++{
++ return bar () + a + 1;
++}
+
-+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-+#error __ARM_FEATURE_FP16_VECTOR_ARITHMETIC not defined.
-+#endif
++/* Checks for saving and clearing prior to function call. */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "mov\tr0, r4" } } */
++/* { dg-final { scan-assembler "mov\tr1, r4" } } */
++/* { dg-final { scan-assembler "mov\tr2, r4" } } */
++/* { dg-final { scan-assembler "mov\tr3, r4" } } */
++/* { dg-final { scan-assembler-not "vmov" } } */
++/* { dg-final { scan-assembler-not "vmsr" } } */
+
-+#ifndef __ARM_NEON
-+#error __ARM_NEON not defined.
-+#endif
++/* Now we check that the correct intrinsic is used for the call. */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+
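The lsrs/lsls pair that nearly all of these call tests scan for is a shift right then shift left by one on r4, the register holding the nonsecure target address; the net effect is to clear bit 0 of the address, and the cleansed r4 value is then also copied into any argument registers that carry no argument, so no secure data leaks through them. In plain C the address manipulation is just the following (my reading of the scans, not wording from the patch):

unsigned int
clear_bit_zero (unsigned int addr)
{
  return (addr >> 1) << 1;   /* lsrs rN, rN, #1 ; lsls rN, rN, #1 */
}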
-+#if !defined (__ARM_FP) || !(__ARM_FP & 0x2)
-+#error Invalid value for __ARM_FP
-+#endif
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-8.c
+@@ -0,0 +1,26 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_main_ok } */
++/* { dg-add-options arm_arch_v8m_main } */
++/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=hard" -mfloat-abi=softfp } {""} } */
++/* { dg-options "-mcmse -mfloat-abi=soft" } */
+
-+#include "arm_neon.h"
++int __attribute__ ((cmse_nonsecure_call)) (*bar) (double);
+
-+float16_t
-+foo (float16x4_t b)
++int
++foo (int a)
+{
-+ float16x4_t a = {2.0, 3.0, 4.0, 5.0};
-+ float16x4_t res = vadd_f16 (a, b);
-+
-+ return res[0];
++ return bar (2.0) + a + 1;
+}
+
-+/* { dg-final { scan-assembler "vadd\\.f16\td\[0-9\]+, d\[0-9\]+" } } */
++/* Checks for saving and clearing prior to function call. */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */
++/* { dg-final { scan-assembler-not "mov\tr1, r4" } } */
++/* { dg-final { scan-assembler "mov\tr2, r4" } } */
++/* { dg-final { scan-assembler "mov\tr3, r4" } } */
++/* { dg-final { scan-assembler-not "vmov" } } */
++/* { dg-final { scan-assembler-not "vmsr" } } */
+
-+#pragma GCC pop_options
++/* Now we check that we use the correct intrinsic to call. */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp-sp/cmse-5.c
+@@ -0,0 +1,46 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_main_ok } */
++/* { dg-add-options arm_arch_v8m_main } */
++/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */
++/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */
++/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-sp-d16" } */
++
++extern float bar (void);
++
++float __attribute__ ((cmse_nonsecure_entry))
++foo (void)
++{
++ return bar ();
++}
++/* { dg-final { scan-assembler "__acle_se_foo:" } } */
++/* { dg-final { scan-assembler-not "mov\tr0, lr" } } */
++/* { dg-final { scan-assembler "mov\tr1, lr" } } */
++/* { dg-final { scan-assembler "mov\tr2, lr" } } */
++/* { dg-final { scan-assembler "mov\tr3, lr" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts0, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts1, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts2, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts3, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts4, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts5, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts6, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts7, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts8, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts9, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts10, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts11, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts12, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts13, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts14, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f32\ts15, #1\.0" } } */
++/* { dg-final { scan-assembler "msr\tAPSR_nzcvq, lr" { target { arm_arch_v8m_main_ok && { ! arm_dsp } } } } } */
++/* { dg-final { scan-assembler "msr\tAPSR_nzcvqg, lr" { target { arm_arch_v8m_main_ok && arm_dsp } } } } */
++/* { dg-final { scan-assembler "push\t{r4}" } } */
++/* { dg-final { scan-assembler "vmrs\tip, fpscr" } } */
++/* { dg-final { scan-assembler "movw\tr4, #65376" } } */
++/* { dg-final { scan-assembler "movt\tr4, #4095" } } */
++/* { dg-final { scan-assembler "and\tip, r4" } } */
++/* { dg-final { scan-assembler "vmsr\tfpscr, ip" } } */
++/* { dg-final { scan-assembler "pop\t{r4}" } } */
++/* { dg-final { scan-assembler "mov\tip, lr" } } */
++/* { dg-final { scan-assembler "bxns" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp-sp/cmse-7.c
+@@ -0,0 +1,26 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_main_ok } */
++/* { dg-add-options arm_arch_v8m_main } */
++/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */
++/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */
++/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-sp-d16" } */
+
-+/* Check that the FP version is correctly reset to mfpu=fp-armv8. */
++int __attribute__ ((cmse_nonsecure_call)) (*bar) (void);
+
-+#if !defined (__ARM_FP) || !(__ARM_FP & 0x2)
-+#error __ARM_FP should record FP16 support.
-+#endif
++int
++foo (int a)
++{
++ return bar () + a + 1;
++}
+
-+#pragma GCC pop_options
++/* Checks for saving and clearing prior to function call. */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "mov\tr0, r4" } } */
++/* { dg-final { scan-assembler "mov\tr1, r4" } } */
++/* { dg-final { scan-assembler "mov\tr2, r4" } } */
++/* { dg-final { scan-assembler "mov\tr3, r4" } } */
+
-+/* Check that the FP version is correctly reset to mfpu=vfp. */
++/* Now we check that the correct intrinsic is used for the call. */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+
-+#if !defined (__ARM_FP) || (__ARM_FP & 0x2)
-+#error Unexpected value for __ARM_FP.
-+#endif
--- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/builtin_saddl.c
-@@ -0,0 +1,17 @@
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp-sp/cmse-8.c
+@@ -0,0 +1,25 @@
+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+/* { dg-require-effective-target arm32 } */
-+extern void overflow_handler ();
-+
-+long overflow_add (long x, long y)
-+{
-+ long r;
++/* { dg-require-effective-target arm_arch_v8m_main_ok } */
++/* { dg-add-options arm_arch_v8m_main } */
++/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */
++/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */
++/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-sp-d16" } */
+
-+ int ovr = __builtin_saddl_overflow (x, y, &r);
-+ if (ovr)
-+ overflow_handler ();
++int __attribute__ ((cmse_nonsecure_call)) (*bar) (double);
+
-+ return r;
++int
++foo (int a)
++{
++ return bar (2.0) + a + 1;
+}
+
-+/* { dg-final { scan-assembler "adds" } } */
++/* Checks for saving and clearing prior to function call. */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */
++/* { dg-final { scan-assembler-not "mov\tr1, r4" } } */
++/* { dg-final { scan-assembler "mov\tr2, r4" } } */
++/* { dg-final { scan-assembler "mov\tr3, r4" } } */
++
++/* Now we check that the correct intrinsic is used for the call. */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
--- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/builtin_saddll.c
-@@ -0,0 +1,18 @@
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-13.c
+@@ -0,0 +1,25 @@
+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+/* { dg-require-effective-target arm32 } */
-+extern void overflow_handler ();
++/* { dg-require-effective-target arm_arch_v8m_main_ok } */
++/* { dg-add-options arm_arch_v8m_main } */
++/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */
++/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */
++/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16" } */
+
-+long long overflow_add (long long x, long long y)
++int __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double);
++
++int
++foo (int a)
+{
-+ long long r;
++ return bar (1.0f, 2.0) + a + 1;
++}
+
-+ int ovr = __builtin_saddll_overflow (x, y, &r);
-+ if (ovr)
-+ overflow_handler ();
++/* Checks for saving and clearing prior to function call. */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */
++/* { dg-final { scan-assembler "\n\tmov\tr1, r4" } } */
++/* { dg-final { scan-assembler-not "\n\tmov\tr2, r4\n\tmov\tr3, r4" } } */
+
-+ return r;
-+}
++/* Now we check that the correct intrinsic is used for the call. */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+
-+/* { dg-final { scan-assembler "adds" } } */
-+/* { dg-final { scan-assembler "adcs" } } */
--- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/builtin_ssubl.c
-@@ -0,0 +1,17 @@
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-5.c
+@@ -0,0 +1,38 @@
+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+/* { dg-require-effective-target arm32 } */
-+extern void overflow_handler ();
++/* { dg-require-effective-target arm_arch_v8m_main_ok } */
++/* { dg-add-options arm_arch_v8m_main } */
++/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */
++/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */
++/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16" } */
++
++extern float bar (void);
++
++float __attribute__ ((cmse_nonsecure_entry))
++foo (void)
++{
++ return bar ();
++}
++/* { dg-final { scan-assembler "__acle_se_foo:" } } */
++/* { dg-final { scan-assembler-not "mov\tr0, lr" } } */
++/* { dg-final { scan-assembler "mov\tr1, lr" } } */
++/* { dg-final { scan-assembler "mov\tr2, lr" } } */
++/* { dg-final { scan-assembler "mov\tr3, lr" } } */
++/* { dg-final { scan-assembler "vmov\.f64\td0, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f64\td1, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f64\td2, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f64\td3, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f64\td4, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f64\td5, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f64\td6, #1\.0" } } */
++/* { dg-final { scan-assembler "vmov\.f64\td7, #1\.0" } } */
++/* { dg-final { scan-assembler "msr\tAPSR_nzcvq, lr" { target { arm_arch_v8m_main_ok && { ! arm_dsp } } } } } */
++/* { dg-final { scan-assembler "msr\tAPSR_nzcvqg, lr" { target { arm_arch_v8m_main_ok && arm_dsp } } } } */
++/* { dg-final { scan-assembler "push\t{r4}" } } */
++/* { dg-final { scan-assembler "vmrs\tip, fpscr" } } */
++/* { dg-final { scan-assembler "movw\tr4, #65376" } } */
++/* { dg-final { scan-assembler "movt\tr4, #4095" } } */
++/* { dg-final { scan-assembler "and\tip, r4" } } */
++/* { dg-final { scan-assembler "vmsr\tfpscr, ip" } } */
++/* { dg-final { scan-assembler "pop\t{r4}" } } */
++/* { dg-final { scan-assembler "mov\tip, lr" } } */
++/* { dg-final { scan-assembler "bxns" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-7.c
+@@ -0,0 +1,26 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v8m_main_ok } */
++/* { dg-add-options arm_arch_v8m_main } */
++/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */
++/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */
++/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16" } */
+
-+long overflow_sub (long x, long y)
++int __attribute__ ((cmse_nonsecure_call)) (*bar) (void);
++
++int
++foo (int a)
+{
-+ long r;
++ return bar () + a + 1;
++}
+
-+ int ovr = __builtin_ssubl_overflow (x, y, &r);
-+ if (ovr)
-+ overflow_handler ();
++/* Checks for saving and clearing prior to function call. */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "mov\tr0, r4" } } */
++/* { dg-final { scan-assembler "mov\tr1, r4" } } */
++/* { dg-final { scan-assembler "mov\tr2, r4" } } */
++/* { dg-final { scan-assembler "mov\tr3, r4" } } */
+
-+ return r;
-+}
++/* Now we check that the correct intrinsic is used for the call. */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
+
-+/* { dg-final { scan-assembler "subs" } } */
--- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/builtin_ssubll.c
-@@ -0,0 +1,18 @@
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-8.c
+@@ -0,0 +1,25 @@
+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+/* { dg-require-effective-target arm32 } */
-+extern void overflow_handler ();
-+
-+long long overflow_sub (long long x, long long y)
-+{
-+ long long r;
++/* { dg-require-effective-target arm_arch_v8m_main_ok } */
++/* { dg-add-options arm_arch_v8m_main } */
++/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */
++/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */
++/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16" } */
+
-+ int ovr = __builtin_ssubll_overflow (x, y, &r);
-+ if (ovr)
-+ overflow_handler ();
++int __attribute__ ((cmse_nonsecure_call)) (*bar) (double);
+
-+ return r;
++int
++foo (int a)
++{
++ return bar (2.0) + a + 1;
+}
+
-+/* { dg-final { scan-assembler "subs" } } */
-+/* { dg-final { scan-assembler "sbcs" } } */
++/* Checks for saving and clearing prior to function call. */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */
++/* { dg-final { scan-assembler-not "mov\tr1, r4" } } */
++/* { dg-final { scan-assembler "mov\tr2, r4" } } */
++/* { dg-final { scan-assembler "mov\tr3, r4" } } */
++
++/* Now we check that the correct intrinsic is used for the call. */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
--- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/builtin_uaddl.c
-@@ -0,0 +1,17 @@
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/union-1.c
+@@ -0,0 +1,69 @@
+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+/* { dg-require-effective-target arm32 } */
-+extern void overflow_handler ();
++/* { dg-options "-mcmse" } */
+
-+unsigned long overflow_add (unsigned long x, unsigned long y)
++typedef struct
+{
-+ unsigned long r;
++ unsigned char a :2;
++ unsigned char :0;
++ unsigned short b :5;
++ unsigned char :0;
++ unsigned short c :3;
++ unsigned char :0;
++ unsigned int d :9;
++} test_st_1;
+
-+ int ovr = __builtin_uaddl_overflow (x, y, &r);
-+ if (ovr)
-+ overflow_handler ();
++typedef struct
++{
++ unsigned short a :7;
++ unsigned char :0;
++ unsigned char b :1;
++ unsigned char :0;
++ unsigned short c :6;
++} test_st_2;
+
-+ return r;
-+}
++typedef union
++{
++ test_st_1 st_1;
++ test_st_2 st_2;
++} test_un;
+
-+/* { dg-final { scan-assembler "adds" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/builtin_uaddll.c
-@@ -0,0 +1,18 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+/* { dg-require-effective-target arm32 } */
-+extern void overflow_handler ();
++typedef union
++{
++ test_un un;
++ struct
++ {
++ unsigned int v1;
++ unsigned int v2;
++ unsigned int v3;
++ unsigned int v4;
++ } values;
++} read_un;
+
-+unsigned long long overflow_add (unsigned long long x, unsigned long long y)
++
++typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_un);
++
++int
++main (void)
+{
-+ unsigned long long r;
++ read_un r;
++ foo_ns f;
+
-+ int ovr = __builtin_uaddll_overflow (x, y, &r);
-+ if (ovr)
-+ overflow_handler ();
++ f = (foo_ns) 0x200000;
++ r.values.v1 = 0xFFFFFFFF;
++ r.values.v2 = 0xFFFFFFFF;
+
-+ return r;
++ f (r.un);
++ return 0;
+}
+
-+/* { dg-final { scan-assembler "adds" } } */
-+/* { dg-final { scan-assembler "adcs" } } */
++/* { dg-final { scan-assembler "movw\tip, #8063" } } */
++/* { dg-final { scan-assembler "movt\tip, 63" } } */
++/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */
++/* { dg-final { scan-assembler "movw\tip, #511" } } */
++/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "mov\tr2, r4" } } */
++/* { dg-final { scan-assembler "mov\tr3, r4" } } */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
++
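The two masks in union-1 decode the same way as the FPSCR mask earlier: movw ip, #8063 with movt ip, 63 builds 0x003F1F7F for the first argument word, and movw ip, #511 builds 0x000001FF for the second. That matches the union of the bitfields above: bits 0-6, 8-12, and 16-21 of word 0 are covered by some field of test_st_1 or test_st_2, and bits 0-8 of word 1 hold the 9-bit d; everything else is padding and is cleared before the nonsecure call. A quick check of the constants (hypothetical helper names):

unsigned int mask_word0 (void) { return (63u << 16) | 8063u; }  /* 0x003F1F7F */
unsigned int mask_word1 (void) { return 511u; }                 /* 0x000001FF */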
--- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/builtin_usubl.c
-@@ -0,0 +1,17 @@
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/union-2.c
+@@ -0,0 +1,84 @@
+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+/* { dg-require-effective-target arm32 } */
-+extern void overflow_handler ();
++/* { dg-options "-mcmse" } */
++
++typedef struct
++{
++ unsigned char a :2;
++ unsigned char :0;
++ unsigned short b :5;
++ unsigned char :0;
++ unsigned short c :3;
++ unsigned char :0;
++ unsigned int d :9;
++} test_st_1;
++
++typedef struct
++{
++ unsigned short a :7;
++ unsigned char :0;
++ unsigned char b :1;
++ unsigned char :0;
++ unsigned short c :6;
++} test_st_2;
++
++typedef struct
++{
++ unsigned char a;
++ unsigned int :0;
++ unsigned int b :1;
++ unsigned short :0;
++ unsigned short c;
++ unsigned int :0;
++ unsigned int d :21;
++} test_st_3;
+
-+unsigned long overflow_sub (unsigned long x, unsigned long y)
++typedef union
+{
-+ unsigned long r;
++ test_st_1 st_1;
++ test_st_2 st_2;
++ test_st_3 st_3;
++} test_un;
+
-+ int ovr = __builtin_usubl_overflow (x, y, &r);
-+ if (ovr)
-+ overflow_handler ();
++typedef union
++{
++ test_un un;
++ struct
++ {
++ unsigned int v1;
++ unsigned int v2;
++ unsigned int v3;
++ unsigned int v4;
++ } values;
++} read_un;
+
-+ return r;
-+}
+
-+/* { dg-final { scan-assembler "subs" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/builtin_usubll.c
-@@ -0,0 +1,18 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+/* { dg-require-effective-target arm32 } */
-+extern void overflow_handler ();
++typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_un);
+
-+unsigned long long overflow_sub (unsigned long long x, unsigned long long y)
++int
++main (void)
+{
-+ unsigned long long r;
++ read_un r;
++ foo_ns f;
+
-+ int ovr = __builtin_usubll_overflow (x, y, &r);
-+ if (ovr)
-+ overflow_handler ();
++ f = (foo_ns) 0x200000;
++ r.values.v1 = 0xFFFFFFFF;
++ r.values.v2 = 0xFFFFFFFF;
++ r.values.v3 = 0xFFFFFFFF;
+
-+ return r;
++ f (r.un);
++ return 0;
+}
+
-+/* { dg-final { scan-assembler "subs" } } */
-+/* { dg-final { scan-assembler "sbcs" } } */
++/* { dg-final { scan-assembler "movw\tip, #8191" } } */
++/* { dg-final { scan-assembler "movt\tip, 63" } } */
++/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */
++/* { dg-final { scan-assembler "movw\tip, #511" } } */
++/* { dg-final { scan-assembler "movt\tip, 65535" } } */
++/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */
++/* { dg-final { scan-assembler "movw\tip, #65535" } } */
++/* { dg-final { scan-assembler "movt\tip, 31" } } */
++/* { dg-final { scan-assembler "and\tr2, r2, ip" } } */
++/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
++/* { dg-final { scan-assembler "mov\tr3, r4" } } */
++/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
--- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cbz.c
-@@ -0,0 +1,12 @@
+/* { dg-do compile { target { arm_thumb2 || arm_thumb1_cbz_ok } } } */
-+/* { dg-options "-O2" } */
++++ b/src/gcc/testsuite/gcc.target/arm/cmse/struct-1.c
+@@ -0,0 +1,33 @@
++/* { dg-do run } */
++/* { dg-options "--save-temps -mcmse -Wl,--section-start,.gnu.sgstubs=0x20400000" } */
++
++typedef struct
++{
++ unsigned char a;
++ unsigned short b;
++} test_st;
++
++test_st __attribute__ ((cmse_nonsecure_entry)) foo (void)
++{
++ test_st t;
++ t.a = 255u;
++ t.b = 32767u;
++ return t;
++}
+
+int
-+foo (int a, int *b)
++main (void)
+{
-+ if (a)
-+ *b = 1;
++ test_st t;
++ t = foo ();
++ if (t.a != 255u || t.b != 32767u)
++ __builtin_abort ();
+ return 0;
+}
+
-+/* { dg-final { scan-assembler-times "cbz\\tr\\d" 1 } } */
++/* { dg-final { scan-assembler "movs\tr1, #255" } } */
++/* { dg-final { scan-assembler "movt\tr1, 65535" } } */
++/* { dg-final { scan-assembler "ands\tr0(, r0)?, r1" } } */
++/* { dg-final { scan-assembler "bxns" } } */
++
++
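The struct-1 mask is readable the same way: movs r1, #255 followed by movt r1, 65535 yields 0xFFFF00FF. In the returned register, byte 0 holds a, bytes 2-3 hold b, and byte 1 is padding, so the ands clears exactly the padding byte before the value is handed back to nonsecure code. As arithmetic (the layout reading is my interpretation of the test):

unsigned int
ret_keep_mask (void)
{
  return (65535u << 16) | 255u;   /* == 0xFFFF00FF */
}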
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/arm/data-rel-1.c
@@ -0,0 +1,12 @@
@@ -81174,6 +91385,54 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+/* { dg-final { scan-assembler-times {mov\tr[0-9]+, r[0-2]} 3 } } */
+/* { dg-final { scan-assembler-times {mov\tr1, r0} 1 } } */
+/* { dg-final { scan-assembler-times {mov\tr0, r[0-9]+} 2 } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/fp16-aapcs-3.c
+@@ -0,0 +1,21 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_hard_vfp_ok } */
++/* { dg-require-effective-target arm_fp16_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_fp16_alternative } */
++
++/* Test __fp16 arguments and return value in registers (hard-float). */
++
++void
++swap (__fp16, __fp16);
++
++__fp16
++F (__fp16 a, __fp16 b, __fp16 c)
++{
++ swap (b, a);
++ return c;
++}
++
++/* { dg-final { scan-assembler-times {vmov\tr[0-9]+, s[0-2]} 2 } } */
++/* { dg-final { scan-assembler-times {vmov.f32\ts1, s0} 1 } } */
++/* { dg-final { scan-assembler-times {vmov\ts0, r[0-9]+} 2 } } */
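Taken together, this test and the softfp variant that follows pin down where __fp16 values travel: under the hard-float ABI the three arguments arrive in s0-s2 and the result leaves in s0 (hence the vmov traffic between core and VFP registers around the call to swap), while under softfp the same values stay in r0-r2. A sketch of the assumed assignment (my reading of the scans):

__fp16 F (__fp16 a, __fp16 b, __fp16 c);
/* hard-float: a/b/c in s0/s1/s2, result in s0;
   softfp:     a/b/c in r0/r1/r2, result in r0.  */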
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/fp16-aapcs-4.c
+@@ -0,0 +1,21 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_fp16_ok } */
++/* { dg-options "-mfloat-abi=softfp -O2" } */
++/* { dg-add-options arm_fp16_alternative } */
++/* { dg-skip-if "incompatible float-abi" { arm*-*-* } { "-mfloat-abi=hard" } } */
++
++/* Test __fp16 arguments and return value in registers (softfp). */
++
++void
++swap (__fp16, __fp16);
++
++__fp16
++F (__fp16 a, __fp16 b, __fp16 c)
++{
++ swap (b, a);
++ return c;
++}
++
++/* { dg-final { scan-assembler-times {mov\tr[0-9]+, r[0-2]} 3 } } */
++/* { dg-final { scan-assembler-times {mov\tr1, r0} 1 } } */
++/* { dg-final { scan-assembler-times {mov\tr0, r[0-9]+} 2 } } */
--- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-1.c
+++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-1.c
@@ -1,4 +1,5 @@
@@ -126167,6 +136426,38 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
-
-/* { dg-final { scan-assembler "vzip\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/optional_thumb-1.c
+@@ -0,0 +1,7 @@
++/* { dg-do compile } */
++/* { dg-skip-if "-marm/-mthumb/-march/-mcpu given" { *-*-*} { "-marm" "-mthumb" "-march=*" "-mcpu=*" } } */
++/* { dg-options "-march=armv6-m" } */
++
++/* Check that -mthumb is not needed when compiling for a Thumb-only target. */
++
++int foo;
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/optional_thumb-2.c
+@@ -0,0 +1,7 @@
++/* { dg-do compile } */
++/* { dg-skip-if "-marm/-mthumb/-march/-mcpu given" { *-*-*} { "-marm" "-mthumb" "-march=*" "-mcpu=*" } } */
++/* { dg-options "-mcpu=cortex-m4" } */
++
++/* Check that -mthumb is not needed when compiling for a Thumb-only target. */
++
++int foo;
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/optional_thumb-3.c
+@@ -0,0 +1,9 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_cortex_m } */
++/* { dg-skip-if "-mthumb given" { *-*-*} { "-mthumb" } } */
++/* { dg-options "-marm" } */
++/* { dg-error "target CPU does not support ARM mode" "missing error with -marm on Thumb-only targets" { target *-*-*} 0 } */
++
++/* Check that -marm gives an error when compiling for a Thumb-only target. */
++
++int foo;
+--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/arm/polytypes.c
@@ -0,0 +1,48 @@
+/* Check that NEON polynomial vector types are suitably incompatible with
@@ -126412,6 +136703,655 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+/* { dg-final { scan-assembler-times {vmov\ts[0-9]+,r[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {sxth\tr[0-9]+,r[0-9]+} 2 } } */
--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/simd/vmaxnm_f32_1.c
+@@ -0,0 +1,159 @@
++/* Test the `vmaxnm_f32' ARM Neon intrinsic.  */
++
++/* { dg-do run } */
++/* { dg-require-effective-target arm_v8_neon_hw } */
++/* { dg-options "-save-temps -O3 -march=armv8-a" } */
++/* { dg-add-options arm_v8_neon } */
++
++#include "arm_neon.h"
++
++extern void abort ();
++
++void __attribute__ ((noinline))
++test_vmaxnm_f32__regular_input1 ()
++{
++ float32_t a1[] = {1,2};
++ float32_t b1[] = {3,4};
++ float32x2_t a = vld1_f32 (a1);
++ float32x2_t b = vld1_f32 (b1);
++ float32x2_t c = vmaxnm_f32 (a, b);
++ float32_t actual[2];
++ vst1_f32 (actual, c);
++
++ for (int i = 0; i < 2; ++i)
++ if (actual[i] != b1[i])
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vmaxnm_f32__regular_input2 ()
++{
++ float32_t a1[] = {3,2};
++ float32_t b1[] = {1,4};
++ float32_t e[] = {3,4};
++ float32x2_t a = vld1_f32 (a1);
++ float32x2_t b = vld1_f32 (b1);
++ float32x2_t c = vmaxnm_f32 (a, b);
++ float32_t actual[2];
++ vst1_f32 (actual, c);
++
++ for (int i = 0; i < 2; ++i)
++ if (actual[i] != e[i])
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vmaxnm_f32__quiet_NaN_one_arg ()
++{
++ /* When given a quiet NaN, vmaxnm returns the other operand.
++ In this test case we have NaNs in only one operand. */
++ float32_t n = __builtin_nanf ("");
++ float32_t a1[] = {1,2};
++ float32_t b1[] = {n,n};
++ float32_t e[] = {1,2};
++ float32x2_t a = vld1_f32 (a1);
++ float32x2_t b = vld1_f32 (b1);
++ float32x2_t c = vmaxnm_f32 (a, b);
++ float32_t actual[2];
++ vst1_f32 (actual, c);
++
++ for (int i = 0; i < 2; ++i)
++ if (actual[i] != e[i])
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vmaxnm_f32__quiet_NaN_both_args ()
++{
++ /* When given a quiet NaN, vmaxnm returns the other operand.
++ In this test case we have NaNs in both operands. */
++ float32_t n = __builtin_nanf ("");
++ float32_t a1[] = {n,2};
++ float32_t b1[] = {1,n};
++ float32_t e[] = {1,2};
++ float32x2_t a = vld1_f32 (a1);
++ float32x2_t b = vld1_f32 (b1);
++ float32x2_t c = vmaxnm_f32 (a, b);
++ float32_t actual[2];
++ vst1_f32 (actual, c);
++
++ for (int i = 0; i < 2; ++i)
++ if (actual[i] != e[i])
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vmaxnm_f32__zero_both_args ()
++{
++ /* For 0 and -0, vmaxnm returns 0. Since 0 == -0, check sign bit. */
++ float32_t a1[] = {0.0, 0.0};
++ float32_t b1[] = {-0.0, -0.0};
++ float32_t e[] = {0.0, 0.0};
++
++ float32x2_t a = vld1_f32 (a1);
++ float32x2_t b = vld1_f32 (b1);
++ float32x2_t c = vmaxnm_f32 (a, b);
++
++ float32_t actual1[2];
++ vst1_f32 (actual1, c);
++
++ for (int i = 0; i < 2; ++i)
++ if (actual1[i] != e[i] || __builtin_signbit (actual1[i]) != 0)
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vmaxnm_f32__inf_both_args ()
++{
++ /* The max of inf and inf is inf. The max of -inf and -inf is -inf. */
++ float32_t inf = __builtin_huge_valf ();
++ float32_t a1[] = {inf, -inf};
++ float32_t b1[] = {inf, -inf};
++ float32_t e[] = {inf, -inf};
++
++ float32x2_t a = vld1_f32 (a1);
++ float32x2_t b = vld1_f32 (b1);
++ float32x2_t c = vmaxnm_f32 (a, b);
++
++ float32_t actual1[2];
++ vst1_f32 (actual1, c);
++
++ for (int i = 0; i < 2; ++i)
++ if (actual1[i] != e[i])
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vmaxnm_f32__two_quiet_NaNs_both_args ()
++{
++ /* When given 2 NaNs, return a NaN. Since a NaN is not equal to anything,
++ not even another NaN, use __builtin_isnan () to check. */
++ float32_t n = __builtin_nanf ("");
++ float32_t a1[] = {n,n};
++ float32_t b1[] = {n,n};
++ float32_t e[] = {n,n};
++ float32x2_t a = vld1_f32 (a1);
++ float32x2_t b = vld1_f32 (b1);
++ float32x2_t c = vmaxnm_f32 (a, b);
++ float32_t actual[2];
++ vst1_f32 (actual, c);
++
++ for (int i = 0; i < 2; ++i)
++ if (!__builtin_isnan (actual[i]))
++ abort ();
++}
++
++int
++main ()
++{
++ test_vmaxnm_f32__regular_input1 ();
++ test_vmaxnm_f32__regular_input2 ();
++ test_vmaxnm_f32__quiet_NaN_one_arg ();
++ test_vmaxnm_f32__quiet_NaN_both_args ();
++ test_vmaxnm_f32__zero_both_args ();
++ test_vmaxnm_f32__inf_both_args ();
++ test_vmaxnm_f32__two_quiet_NaNs_both_args ();
++ return 0;
++}
++
++/* { dg-final { scan-assembler-times "vmaxnm\.f32\t\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+\n" 7 } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/simd/vmaxnmq_f32_1.c
+@@ -0,0 +1,160 @@
++/* Test the `vmaxnmq_f32' ARM Neon intrinsic.  */
++
++/* { dg-do run } */
++/* { dg-require-effective-target arm_v8_neon_hw } */
++/* { dg-options "-save-temps -O3 -march=armv8-a" } */
++/* { dg-add-options arm_v8_neon } */
++
++#include "arm_neon.h"
++
++extern void abort ();
++
++void __attribute__ ((noinline))
++test_vmaxnmq_f32__regular_input1 ()
++{
++ float32_t a1[] = {1,2,5,6};
++ float32_t b1[] = {3,4,7,8};
++ float32x4_t a = vld1q_f32 (a1);
++ float32x4_t b = vld1q_f32 (b1);
++ float32x4_t c = vmaxnmq_f32 (a, b);
++ float32_t actual[4];
++ vst1q_f32 (actual, c);
++
++ for (int i = 0; i < 4; ++i)
++ if (actual[i] != b1[i])
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vmaxnmq_f32__regular_input2 ()
++{
++ float32_t a1[] = {3,2,7,6};
++ float32_t b1[] = {1,4,5,8};
++ float32_t e[] = {3,4,7,8};
++ float32x4_t a = vld1q_f32 (a1);
++ float32x4_t b = vld1q_f32 (b1);
++ float32x4_t c = vmaxnmq_f32 (a, b);
++ float32_t actual[4];
++ vst1q_f32 (actual, c);
++
++ for (int i = 0; i < 4; ++i)
++ if (actual[i] != e[i])
++ abort ();
++}
++
++
++void __attribute__ ((noinline))
++test_vmaxnmq_f32__quiet_NaN_one_arg ()
++{
++ /* When given a quiet NaN, vmaxnmq returns the other operand.
++ In this test case we have NaNs in only one operand. */
++ float32_t n = __builtin_nanf ("");
++ float32_t a1[] = {1,2,3,4};
++ float32_t b1[] = {n,n,n,n};
++ float32_t e[] = {1,2,3,4};
++ float32x4_t a = vld1q_f32 (a1);
++ float32x4_t b = vld1q_f32 (b1);
++ float32x4_t c = vmaxnmq_f32 (a, b);
++ float32_t actual[4];
++ vst1q_f32 (actual, c);
++
++ for (int i = 0; i < 4; ++i)
++ if (actual[i] != e[i])
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vmaxnmq_f32__quiet_NaN_both_args ()
++{
++ /* When given a quiet NaN, vmaxnmq returns the other operand.
++ In this test case we have NaNs in both operands. */
++ float32_t n = __builtin_nanf ("");
++ float32_t a1[] = {n,2,n,4};
++ float32_t b1[] = {1,n,3,n};
++ float32_t e[] = {1,2,3,4};
++ float32x4_t a = vld1q_f32 (a1);
++ float32x4_t b = vld1q_f32 (b1);
++ float32x4_t c = vmaxnmq_f32 (a, b);
++ float32_t actual[4];
++ vst1q_f32 (actual, c);
++
++ for (int i = 0; i < 4; ++i)
++ if (actual[i] != e[i])
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vmaxnmq_f32__zero_both_args ()
++{
++ /* For 0 and -0, vmaxnmq returns 0. Since 0 == -0, check sign bit. */
++ float32_t a1[] = {0.0, 0.0, -0.0, -0.0};
++ float32_t b1[] = {-0.0, -0.0, 0.0, 0.0};
++ float32_t e[] = {0.0, 0.0, 0.0, 0.0};
++
++ float32x4_t a = vld1q_f32 (a1);
++ float32x4_t b = vld1q_f32 (b1);
++ float32x4_t c = vmaxnmq_f32 (a, b);
++
++ float32_t actual1[4];
++ vst1q_f32 (actual1, c);
++
++ for (int i = 0; i < 4; ++i)
++ if (actual1[i] != e[i] || __builtin_signbit (actual1[i]) != 0)
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vmaxnmq_f32__inf_both_args ()
++{
++ /* The max of inf and inf is inf. The max of -inf and -inf is -inf. */
++ float32_t inf = __builtin_huge_valf ();
++ float32_t a1[] = {inf, -inf, inf, inf};
++ float32_t b1[] = {inf, -inf, -inf, -inf};
++ float32_t e[] = {inf, -inf, inf, inf};
++
++ float32x4_t a = vld1q_f32 (a1);
++ float32x4_t b = vld1q_f32 (b1);
++ float32x4_t c = vmaxnmq_f32 (a, b);
++
++ float32_t actual1[4];
++ vst1q_f32 (actual1, c);
++
++ for (int i = 0; i < 4; ++i)
++ if (actual1[i] != e[i])
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vmaxnmq_f32__two_quiet_NaNs_both_args ()
++{
++ /* When given 2 NaNs, return a NaN. Since a NaN is not equal to anything,
++ not even another NaN, use __builtin_isnan () to check. */
++ float32_t n = __builtin_nanf ("");
++ float32_t a1[] = {n,n,n,n};
++ float32_t b1[] = {n,n,n,n};
++ float32_t e[] = {n,n,n,n};
++ float32x4_t a = vld1q_f32 (a1);
++ float32x4_t b = vld1q_f32 (b1);
++ float32x4_t c = vmaxnmq_f32 (a, b);
++ float32_t actual[4];
++ vst1q_f32 (actual, c);
++
++ for (int i = 0; i < 4; ++i)
++ if (!__builtin_isnan (actual[i]))
++ abort ();
++}
++
++int
++main ()
++{
++ test_vmaxnmq_f32__regular_input1 ();
++ test_vmaxnmq_f32__regular_input2 ();
++ test_vmaxnmq_f32__quiet_NaN_one_arg ();
++ test_vmaxnmq_f32__quiet_NaN_both_args ();
++ test_vmaxnmq_f32__zero_both_args ();
++ test_vmaxnmq_f32__inf_both_args ();
++ test_vmaxnmq_f32__two_quiet_NaNs_both_args ();
++ return 0;
++}
++
++/* { dg-final { scan-assembler-times "vmaxnm\.f32\t\[qQ\]\[0-9\]+, ?\[qQ\]\[0-9\]+, ?\[qQ\]\[0-9\]+\n" 7 } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/simd/vminnm_f32_1.c
+@@ -0,0 +1,159 @@
++/* Test the `vminnm_f32' ARM Neon intrinsic.  */
++
++/* { dg-do run } */
++/* { dg-require-effective-target arm_v8_neon_hw } */
++/* { dg-options "-save-temps -O3 -march=armv8-a" } */
++/* { dg-add-options arm_v8_neon } */
++
++#include "arm_neon.h"
++
++extern void abort ();
++
++void __attribute__ ((noinline))
++test_vminnm_f32__regular_input1 ()
++{
++ float32_t a1[] = {1,2};
++ float32_t b1[] = {3,4};
++ float32x2_t a = vld1_f32 (a1);
++ float32x2_t b = vld1_f32 (b1);
++ float32x2_t c = vminnm_f32 (a, b);
++ float32_t actual[2];
++ vst1_f32 (actual, c);
++
++ for (int i = 0; i < 2; ++i)
++ if (actual[i] != a1[i])
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vminnm_f32__regular_input2 ()
++{
++ float32_t a1[] = {3,2};
++ float32_t b1[] = {1,4};
++ float32_t e[] = {1,2};
++ float32x2_t a = vld1_f32 (a1);
++ float32x2_t b = vld1_f32 (b1);
++ float32x2_t c = vminnm_f32 (a, b);
++ float32_t actual[2];
++ vst1_f32 (actual, c);
++
++ for (int i = 0; i < 2; ++i)
++ if (actual[i] != e[i])
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vminnm_f32__quiet_NaN_one_arg ()
++{
++ /* When given a quiet NaN, vminnm returns the other operand.
++ In this test case we have NaNs in only one operand. */
++ float32_t n = __builtin_nanf ("");
++ float32_t a1[] = {1,2};
++ float32_t b1[] = {n,n};
++ float32_t e[] = {1,2};
++ float32x2_t a = vld1_f32 (a1);
++ float32x2_t b = vld1_f32 (b1);
++ float32x2_t c = vminnm_f32 (a, b);
++ float32_t actual[2];
++ vst1_f32 (actual, c);
++
++ for (int i = 0; i < 2; ++i)
++ if (actual[i] != e[i])
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vminnm_f32__quiet_NaN_both_args ()
++{
++ /* When given a quiet NaN, vminnm returns the other operand.
++ In this test case we have NaNs in both operands. */
++ float32_t n = __builtin_nanf ("");
++ float32_t a1[] = {n,2};
++ float32_t b1[] = {1,n};
++ float32_t e[] = {1,2};
++ float32x2_t a = vld1_f32 (a1);
++ float32x2_t b = vld1_f32 (b1);
++ float32x2_t c = vminnm_f32 (a, b);
++ float32_t actual[2];
++ vst1_f32 (actual, c);
++
++ for (int i = 0; i < 2; ++i)
++ if (actual[i] != e[i])
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vminnm_f32__zero_both_args ()
++{
++ /* For 0 and -0, vminnm returns -0. Since 0 == -0, check sign bit. */
++ float32_t a1[] = {0.0, 0.0};
++ float32_t b1[] = {-0.0, -0.0};
++ float32_t e[] = {-0.0, -0.0};
++
++ float32x2_t a = vld1_f32 (a1);
++ float32x2_t b = vld1_f32 (b1);
++ float32x2_t c = vminnm_f32 (a, b);
++
++ float32_t actual1[2];
++ vst1_f32 (actual1, c);
++
++ for (int i = 0; i < 2; ++i)
++ if (actual1[i] != e[i] || __builtin_signbit (actual1[i]) == 0)
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vminnm_f32__inf_both_args ()
++{
++ /* The min of inf and inf is inf. The min of -inf and -inf is -inf. */
++ float32_t inf = __builtin_huge_valf ();
++ float32_t a1[] = {inf, -inf};
++ float32_t b1[] = {inf, -inf};
++ float32_t e[] = {inf, -inf};
++
++ float32x2_t a = vld1_f32 (a1);
++ float32x2_t b = vld1_f32 (b1);
++ float32x2_t c = vminnm_f32 (a, b);
++
++ float32_t actual1[2];
++ vst1_f32 (actual1, c);
++
++ for (int i = 0; i < 2; ++i)
++ if (actual1[i] != e[i])
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vminnm_f32__two_quiet_NaNs_both_args ()
++{
++ /* When given 2 NaNs, return a NaN. Since a NaN is not equal to anything,
++ not even another NaN, use __builtin_isnan () to check. */
++ float32_t n = __builtin_nanf ("");
++ float32_t a1[] = {n,n};
++ float32_t b1[] = {n,n};
++ float32_t e[] = {n,n};
++ float32x2_t a = vld1_f32 (a1);
++ float32x2_t b = vld1_f32 (b1);
++ float32x2_t c = vminnm_f32 (a, b);
++ float32_t actual[2];
++ vst1_f32 (actual, c);
++
++ for (int i = 0; i < 2; ++i)
++ if (!__builtin_isnan (actual[i]))
++ abort ();
++}
++
++int
++main ()
++{
++ test_vminnm_f32__regular_input1 ();
++ test_vminnm_f32__regular_input2 ();
++ test_vminnm_f32__quiet_NaN_one_arg ();
++ test_vminnm_f32__quiet_NaN_both_args ();
++ test_vminnm_f32__zero_both_args ();
++ test_vminnm_f32__inf_both_args ();
++ test_vminnm_f32__two_quiet_NaNs_both_args ();
++ return 0;
++}
++
++/* { dg-final { scan-assembler-times "vminnm\.f32\t\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+\n" 7 } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/simd/vminnmq_f32_1.c
+@@ -0,0 +1,159 @@
++/* Test the `vminnmq_f32' ARM Neon intrinsic.  */
++
++/* { dg-do run } */
++/* { dg-require-effective-target arm_v8_neon_hw } */
++/* { dg-options "-save-temps -O3 -march=armv8-a" } */
++/* { dg-add-options arm_v8_neon } */
++
++#include "arm_neon.h"
++
++extern void abort ();
++
++void __attribute__ ((noinline))
++test_vminnmq_f32__regular_input1 ()
++{
++ float32_t a1[] = {1,2,5,6};
++ float32_t b1[] = {3,4,7,8};
++ float32x4_t a = vld1q_f32 (a1);
++ float32x4_t b = vld1q_f32 (b1);
++ float32x4_t c = vminnmq_f32 (a, b);
++ float32_t actual[4];
++ vst1q_f32 (actual, c);
++
++ for (int i = 0; i < 4; ++i)
++ if (actual[i] != a1[i])
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vminnmq_f32__regular_input2 ()
++{
++ float32_t a1[] = {3,2,7,6};
++ float32_t b1[] = {1,4,5,8};
++ float32_t e[] = {1,2,5,6};
++ float32x4_t a = vld1q_f32 (a1);
++ float32x4_t b = vld1q_f32 (b1);
++ float32x4_t c = vminnmq_f32 (a, b);
++ float32_t actual[4];
++ vst1q_f32 (actual, c);
++
++ for (int i = 0; i < 4; ++i)
++ if (actual[i] != e[i])
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vminnmq_f32__quiet_NaN_one_arg ()
++{
++ /* When given a quiet NaN, vminnmq returns the other operand.
++ In this test case we have NaNs in only one operand. */
++ float32_t n = __builtin_nanf ("");
++ float32_t a1[] = {1,2,3,4};
++ float32_t b1[] = {n,n,n,n};
++ float32_t e[] = {1,2,3,4};
++ float32x4_t a = vld1q_f32 (a1);
++ float32x4_t b = vld1q_f32 (b1);
++ float32x4_t c = vminnmq_f32 (a, b);
++ float32_t actual[4];
++ vst1q_f32 (actual, c);
++
++ for (int i = 0; i < 4; ++i)
++ if (actual[i] != e[i])
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vminnmq_f32__quiet_NaN_both_args ()
++{
++ /* When given a quiet NaN, vminnmq returns the other operand.
++ In this test case we have NaNs in both operands. */
++ float32_t n = __builtin_nanf ("");
++ float32_t a1[] = {n,2,n,4};
++ float32_t b1[] = {1,n,3,n};
++ float32_t e[] = {1,2,3,4};
++ float32x4_t a = vld1q_f32 (a1);
++ float32x4_t b = vld1q_f32 (b1);
++ float32x4_t c = vminnmq_f32 (a, b);
++ float32_t actual[4];
++ vst1q_f32 (actual, c);
++
++ for (int i = 0; i < 4; ++i)
++ if (actual[i] != e[i])
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vminnmq_f32__zero_both_args ()
++{
++ /* For 0 and -0, vminnmq returns -0. Since 0 == -0, check sign bit. */
++ float32_t a1[] = {0.0, 0.0, -0.0, -0.0};
++ float32_t b1[] = {-0.0, -0.0, 0.0, 0.0};
++ float32_t e[] = {-0.0, -0.0, -0.0, -0.0};
++
++ float32x4_t a = vld1q_f32 (a1);
++ float32x4_t b = vld1q_f32 (b1);
++ float32x4_t c = vminnmq_f32 (a, b);
++
++ float32_t actual1[4];
++ vst1q_f32 (actual1, c);
++
++ for (int i = 0; i < 4; ++i)
++ if (actual1[i] != e[i] || __builtin_signbit (actual1[i]) == 0)
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vminnmq_f32__inf_both_args ()
++{
++ /* The min of inf and inf is inf. The min of -inf and -inf is -inf. */
++ float32_t inf = __builtin_huge_valf ();
++ float32_t a1[] = {inf, -inf, inf, inf};
++ float32_t b1[] = {inf, -inf, -inf, -inf};
++ float32_t e[] = {inf, -inf, -inf, -inf};
++
++ float32x4_t a = vld1q_f32 (a1);
++ float32x4_t b = vld1q_f32 (b1);
++ float32x4_t c = vminnmq_f32 (a, b);
++
++ float32_t actual1[4];
++ vst1q_f32 (actual1, c);
++
++ for (int i = 0; i < 4; ++i)
++ if (actual1[i] != e[i])
++ abort ();
++}
++
++void __attribute__ ((noinline))
++test_vminnmq_f32__two_quiet_NaNs_both_args ()
++{
++ /* When given 2 NaNs, return a NaN. Since a NaN is not equal to anything,
++ not even another NaN, use __builtin_isnan () to check. */
++ float32_t n = __builtin_nanf ("");
++ float32_t a1[] = {n,n,n,n};
++ float32_t b1[] = {n,n,n,n};
++ float32_t e[] = {n,n,n,n};
++ float32x4_t a = vld1q_f32 (a1);
++ float32x4_t b = vld1q_f32 (b1);
++ float32x4_t c = vminnmq_f32 (a, b);
++ float32_t actual[4];
++ vst1q_f32 (actual, c);
++
++ for (int i = 0; i < 4; ++i)
++ if (!__builtin_isnan (actual[i]))
++ abort ();
++}
++
++int
++main ()
++{
++ test_vminnmq_f32__regular_input1 ();
++ test_vminnmq_f32__regular_input2 ();
++ test_vminnmq_f32__quiet_NaN_one_arg ();
++ test_vminnmq_f32__quiet_NaN_both_args ();
++ test_vminnmq_f32__zero_both_args ();
++ test_vminnmq_f32__inf_both_args ();
++ test_vminnmq_f32__two_quiet_NaNs_both_args ();
++ return 0;
++}
++
++/* { dg-final { scan-assembler-times "vminnm\.f32\t\[qQ\]\[0-9\]+, ?\[qQ\]\[0-9\]+, ?\[qQ\]\[0-9\]+\n" 7 } } */
+--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/arm/vect-vcvt.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
@@ -126541,7 +137481,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
--- a/src/gcc/testsuite/lib/target-supports.exp
+++ b/src/gcc/testsuite/lib/target-supports.exp
-@@ -2938,6 +2938,28 @@ proc add_options_for_arm_v8_1a_neon { flags } {
+@@ -2936,6 +2936,28 @@ proc add_options_for_arm_v8_1a_neon { flags } {
return "$flags $et_arm_v8_1a_neon_flags -march=armv8.1-a"
}
@@ -126570,7 +137510,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
proc add_options_for_arm_crc { flags } {
if { ! [check_effective_target_arm_crc_ok] } {
return "$flags"
-@@ -3024,23 +3046,25 @@ proc check_effective_target_arm_crc_ok { } {
+@@ -3022,23 +3044,25 @@ proc check_effective_target_arm_crc_ok { } {
proc check_effective_target_arm_neon_fp16_ok_nocache { } {
global et_arm_neon_fp16_flags
@@ -126600,7 +137540,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return 1
}
}
-@@ -3077,6 +3101,65 @@ proc add_options_for_arm_neon_fp16 { flags } {
+@@ -3075,6 +3099,65 @@ proc add_options_for_arm_neon_fp16 { flags } {
return "$flags $et_arm_neon_fp16_flags"
}
@@ -126666,7 +137606,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
# Return 1 if this is an ARM target supporting -mfpu=neon-fp-armv8
# -mfloat-abi=softfp or equivalent options. Some multilibs may be
# incompatible with these options. Also set et_arm_v8_neon_flags to the
-@@ -3119,8 +3202,10 @@ proc check_effective_target_arm_v8_neon_ok { } {
+@@ -3117,8 +3200,10 @@ proc check_effective_target_arm_v8_neon_ok { } {
proc check_effective_target_arm_neonv2_ok_nocache { } {
global et_arm_neonv2_flags
@@ -126678,7 +137618,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
foreach flags {"" "-mfloat-abi=softfp" "-mfpu=neon-vfpv4" "-mfpu=neon-vfpv4 -mfloat-abi=softfp"} {
if { [check_no_compiler_messages_nocache arm_neonv2_ok object {
#include "arm_neon.h"
-@@ -3129,8 +3214,8 @@ proc check_effective_target_arm_neonv2_ok_nocache { } {
+@@ -3127,8 +3212,8 @@ proc check_effective_target_arm_neonv2_ok_nocache { } {
{
return vfma_f32 (a, b, c);
}
@@ -126689,7 +137629,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
return 1
}
}
-@@ -3144,9 +3229,9 @@ proc check_effective_target_arm_neonv2_ok { } {
+@@ -3142,9 +3227,9 @@ proc check_effective_target_arm_neonv2_ok { } {
check_effective_target_arm_neonv2_ok_nocache]
}
@@ -126702,7 +137642,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
proc add_options_for_arm_fp16 { flags } {
if { ! [check_effective_target_arm_fp16_ok] } {
-@@ -3156,9 +3241,32 @@ proc add_options_for_arm_fp16 { flags } {
+@@ -3154,9 +3239,32 @@ proc add_options_for_arm_fp16 { flags } {
return "$flags $et_arm_fp16_flags"
}
@@ -126736,7 +137676,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
proc check_effective_target_arm_fp16_ok_nocache { } {
global et_arm_fp16_flags
-@@ -3166,7 +3274,10 @@ proc check_effective_target_arm_fp16_ok_nocache { } {
+@@ -3164,7 +3272,10 @@ proc check_effective_target_arm_fp16_ok_nocache { } {
if { ! [check_effective_target_arm32] } {
return 0;
}
@@ -126748,7 +137688,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
# Multilib flags would override -mfpu.
return 0
}
-@@ -3202,6 +3313,28 @@ proc check_effective_target_arm_fp16_ok { } {
+@@ -3200,6 +3311,28 @@ proc check_effective_target_arm_fp16_ok { } {
check_effective_target_arm_fp16_ok_nocache]
}
@@ -126777,19 +137717,50 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
# Creates a series of routines that return 1 if the given architecture
# can be selected and a routine to give the flags to select that architecture
# Note: Extra flags may be added to disable options from newer compilers
-@@ -3226,7 +3359,10 @@ foreach { armfunc armflag armdef } { v4 "-march=armv4 -marm" __ARM_ARCH_4__
- v7m "-march=armv7-m -mthumb" __ARM_ARCH_7M__
- v7em "-march=armv7e-m -mthumb" __ARM_ARCH_7EM__
- v8a "-march=armv8-a" __ARM_ARCH_8A__
+@@ -3209,22 +3342,26 @@ proc check_effective_target_arm_fp16_ok { } {
+ # Usage: /* { dg-require-effective-target arm_arch_v5_ok } */
+ # /* { dg-add-options arm_arch_v5 } */
+ # /* { dg-require-effective-target arm_arch_v5_multilib } */
+-foreach { armfunc armflag armdef } { v4 "-march=armv4 -marm" __ARM_ARCH_4__
+- v4t "-march=armv4t" __ARM_ARCH_4T__
+- v5 "-march=armv5 -marm" __ARM_ARCH_5__
+- v5t "-march=armv5t" __ARM_ARCH_5T__
+- v5te "-march=armv5te" __ARM_ARCH_5TE__
+- v6 "-march=armv6" __ARM_ARCH_6__
+- v6k "-march=armv6k" __ARM_ARCH_6K__
+- v6t2 "-march=armv6t2" __ARM_ARCH_6T2__
+- v6z "-march=armv6z" __ARM_ARCH_6Z__
+- v6m "-march=armv6-m -mthumb" __ARM_ARCH_6M__
+- v7a "-march=armv7-a" __ARM_ARCH_7A__
+- v7r "-march=armv7-r" __ARM_ARCH_7R__
+- v7m "-march=armv7-m -mthumb" __ARM_ARCH_7M__
+- v7em "-march=armv7e-m -mthumb" __ARM_ARCH_7EM__
+- v8a "-march=armv8-a" __ARM_ARCH_8A__
- v8_1a "-march=armv8.1a" __ARM_ARCH_8A__ } {
-+ v8_1a "-march=armv8.1a" __ARM_ARCH_8A__
-+ v8_2a "-march=armv8.2a" __ARM_ARCH_8A__
-+ v8m_base "-march=armv8-m.base -mthumb" __ARM_ARCH_8M_BASE__
-+ v8m_main "-march=armv8-m.main -mthumb" __ARM_ARCH_8M_MAIN__ } {
++foreach { armfunc armflag armdef } {
++ v4 "-march=armv4 -marm" __ARM_ARCH_4__
++ v4t "-march=armv4t" __ARM_ARCH_4T__
++ v5 "-march=armv5 -marm" __ARM_ARCH_5__
++ v5t "-march=armv5t" __ARM_ARCH_5T__
++ v5te "-march=armv5te" __ARM_ARCH_5TE__
++ v6 "-march=armv6" __ARM_ARCH_6__
++ v6k "-march=armv6k" __ARM_ARCH_6K__
++ v6t2 "-march=armv6t2" __ARM_ARCH_6T2__
++ v6z "-march=armv6z" __ARM_ARCH_6Z__
++ v6m "-march=armv6-m -mthumb -mfloat-abi=soft" __ARM_ARCH_6M__
++ v7a "-march=armv7-a" __ARM_ARCH_7A__
++ v7r "-march=armv7-r" __ARM_ARCH_7R__
++ v7m "-march=armv7-m -mthumb" __ARM_ARCH_7M__
++ v7em "-march=armv7e-m -mthumb" __ARM_ARCH_7EM__
++ v8a "-march=armv8-a" __ARM_ARCH_8A__
++ v8_1a "-march=armv8.1a" __ARM_ARCH_8A__
++ v8_2a "-march=armv8.2a" __ARM_ARCH_8A__
++ v8m_base "-march=armv8-m.base -mthumb -mfloat-abi=soft" __ARM_ARCH_8M_BASE__
++ v8m_main "-march=armv8-m.main -mthumb" __ARM_ARCH_8M_MAIN__ } {
eval [string map [list FUNC $armfunc FLAG $armflag DEF $armdef ] {
proc check_effective_target_arm_arch_FUNC_ok { } {
if { [ string match "*-marm*" "FLAG" ] &&
-@@ -3354,15 +3490,47 @@ proc check_effective_target_arm_cortex_m { } {
+@@ -3352,15 +3489,60 @@ proc check_effective_target_arm_cortex_m { } {
return 0
}
return [check_no_compiler_messages arm_cortex_m assembly {
@@ -126838,10 +137809,23 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
+ }
+}
+
++# Return 1 if this is an ARM target where the ARMv8-M Security Extensions
++# are available.
++
++proc check_effective_target_arm_cmse_ok {} {
++ return [check_no_compiler_messages arm_cmse object {
++ int
++ foo (void)
++ {
++ asm ("bxns r0");
++ }
++ } "-mcmse"];
++}
++
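The new arm_cmse_ok effective-target check simply tries to build a function containing a bxns instruction with -mcmse. A test that needs the Security Extensions would then guard itself along these lines (a usage sketch mirroring the directives used elsewhere in this patch):

/* { dg-do compile } */
/* { dg-require-effective-target arm_cmse_ok } */
/* { dg-options "-mcmse" } */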
# Return 1 if this compilation turns on string_ops_prefer_neon on.
proc check_effective_target_arm_tune_string_ops_prefer_neon { } {
-@@ -3438,6 +3606,76 @@ proc check_effective_target_arm_v8_1a_neon_ok { } {
+@@ -3436,6 +3618,76 @@ proc check_effective_target_arm_v8_1a_neon_ok { } {
check_effective_target_arm_v8_1a_neon_ok_nocache]
}
@@ -126918,7 +137902,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
# Return 1 if the target supports executing ARMv8 NEON instructions, 0
# otherwise.
-@@ -3447,11 +3685,17 @@ proc check_effective_target_arm_v8_neon_hw { } {
+@@ -3445,11 +3697,17 @@ proc check_effective_target_arm_v8_neon_hw { } {
int
main (void)
{
@@ -126938,7 +137922,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
} [add_options_for_arm_v8_neon ""]]
}
-@@ -3494,6 +3738,81 @@ proc check_effective_target_arm_v8_1a_neon_hw { } {
+@@ -3492,6 +3750,81 @@ proc check_effective_target_arm_v8_1a_neon_hw { } {
} [add_options_for_arm_v8_1a_neon ""]]
}
@@ -127020,7 +138004,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
# Return 1 if this is a ARM target with NEON enabled.
proc check_effective_target_arm_neon { } {
-@@ -3528,6 +3847,25 @@ proc check_effective_target_arm_neonv2 { } {
+@@ -3526,6 +3859,25 @@ proc check_effective_target_arm_neonv2 { } {
}
}
@@ -127046,7 +138030,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
# Return 1 if this a Loongson-2E or -2F target using an ABI that supports
# the Loongson vector modes.
-@@ -4382,6 +4720,8 @@ proc check_effective_target_vect_widen_sum_hi_to_si_pattern { } {
+@@ -4380,6 +4732,8 @@ proc check_effective_target_vect_widen_sum_hi_to_si_pattern { } {
set et_vect_widen_sum_hi_to_si_pattern_saved 0
if { [istarget powerpc*-*-*]
|| [istarget aarch64*-*-*]
@@ -127055,7 +138039,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
|| [istarget ia64-*-*] } {
set et_vect_widen_sum_hi_to_si_pattern_saved 1
}
-@@ -5757,6 +6097,8 @@ proc check_effective_target_sync_int_long { } {
+@@ -5755,6 +6109,8 @@ proc check_effective_target_sync_int_long { } {
|| [istarget aarch64*-*-*]
|| [istarget alpha*-*-*]
|| [istarget arm*-*-linux-*]
@@ -127064,7 +138048,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
|| [istarget bfin*-*linux*]
|| [istarget hppa*-*linux*]
|| [istarget s390*-*-*]
-@@ -5790,6 +6132,8 @@ proc check_effective_target_sync_char_short { } {
+@@ -5788,6 +6144,8 @@ proc check_effective_target_sync_char_short { } {
|| [istarget i?86-*-*] || [istarget x86_64-*-*]
|| [istarget alpha*-*-*]
|| [istarget arm*-*-linux-*]
@@ -127171,7 +138155,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
}
}
-@@ -1934,7 +1933,7 @@ evaluate_stmt (gimple *stmt)
+@@ -1939,7 +1938,7 @@ evaluate_stmt (gimple *stmt)
{
val.lattice_val = CONSTANT;
val.value = build_zero_cst (TREE_TYPE (lhs));
@@ -127180,7 +138164,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
is_constant = true;
}
else
-@@ -1945,7 +1944,8 @@ evaluate_stmt (gimple *stmt)
+@@ -1950,7 +1949,8 @@ evaluate_stmt (gimple *stmt)
if (nonzero_bits == 0)
val.mask = 0;
else
@@ -127192,7 +138176,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
--- a/src/gcc/tree-ssa-strlen.c
+++ b/src/gcc/tree-ssa-strlen.c
-@@ -2260,7 +2260,7 @@ public:
+@@ -2263,7 +2263,7 @@ public:
};
/* Callback for walk_dominator_tree. Attempt to optimize various
@@ -127203,7 +138187,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
strlen_dom_walker::before_dom_children (basic_block bb)
--- a/src/gcc/tree-vect-data-refs.c
+++ b/src/gcc/tree-vect-data-refs.c
-@@ -2238,6 +2238,7 @@ vect_analyze_group_access_1 (struct data_reference *dr)
+@@ -2250,6 +2250,7 @@ vect_analyze_group_access_1 (struct data_reference *dr)
{
GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
@@ -127669,7 +138653,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Pattern detected. */
--- a/src/gcc/tree-vect-stmts.c
+++ b/src/gcc/tree-vect-stmts.c
-@@ -6323,12 +6323,22 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
+@@ -6354,12 +6354,22 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
gcc_assert (!nested_in_vect_loop && !STMT_VINFO_GATHER_SCATTER_P (stmt_info));
first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
@@ -127693,7 +138677,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
if (first_stmt == stmt
&& !GROUP_NEXT_ELEMENT (stmt_info))
{
-@@ -6342,7 +6352,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
+@@ -6373,7 +6383,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
}
/* Single-element interleaving requires peeling for gaps. */
@@ -127702,7 +138686,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
/* If there is a gap in the end of the group or the group size cannot
-@@ -6350,9 +6360,8 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
+@@ -6381,9 +6391,8 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
elements in the last iteration and thus need to peel that off. */
if (loop_vinfo
&& ! STMT_VINFO_STRIDED_P (stmt_info)
@@ -127714,7 +138698,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-@@ -6372,8 +6381,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
+@@ -6403,8 +6412,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
slp_perm = true;
@@ -127723,7 +138707,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* ??? The following is overly pessimistic (as well as the loop
case above) in the case we can statically determine the excess
elements loaded are within the bounds of a decl that is accessed.
-@@ -6386,16 +6393,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
+@@ -6417,16 +6424,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
return false;
}
@@ -127901,7 +138885,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
--- a/src/gcc/tree-vrp.c
+++ b/src/gcc/tree-vrp.c
-@@ -3130,6 +3130,24 @@ extract_range_from_binary_expr_1 (value_range *vr,
+@@ -3147,6 +3147,24 @@ extract_range_from_binary_expr_1 (value_range *vr,
if (int_cst_range1 && tree_int_cst_sgn (vr1.min) >= 0)
wmax = wi::min (wmax, vr1.max, TYPE_SIGN (expr_type));
max = wide_int_to_tree (expr_type, wmax);
@@ -127926,7 +138910,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
}
else if (code == BIT_IOR_EXPR)
{
-@@ -3824,7 +3842,8 @@ extract_range_basic (value_range *vr, gimple *stmt)
+@@ -3841,7 +3859,8 @@ extract_range_basic (value_range *vr, gimple *stmt)
arg = gimple_call_arg (stmt, 0);
if (TREE_CODE (arg) == SSA_NAME
&& SSA_NAME_IS_DEFAULT_DEF (arg)
@@ -127936,7 +138920,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
{
set_value_range_to_null (vr, type);
return;
-@@ -9906,6 +9925,40 @@ simplify_internal_call_using_ranges (gimple_stmt_iterator *gsi, gimple *stmt)
+@@ -9919,6 +9938,40 @@ simplify_internal_call_using_ranges (gimple_stmt_iterator *gsi, gimple *stmt)
return true;
}
@@ -127977,7 +138961,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
/* Simplify STMT using ranges if possible. */
static bool
-@@ -9916,6 +9969,68 @@ simplify_stmt_using_ranges (gimple_stmt_iterator *gsi)
+@@ -9929,6 +9982,68 @@ simplify_stmt_using_ranges (gimple_stmt_iterator *gsi)
{
enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
tree rhs1 = gimple_assign_rhs1 (stmt);
@@ -128046,19 +139030,102 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
switch (rhs_code)
{
+--- a/src/gcc/tree.h
++++ b/src/gcc/tree.h
+@@ -4628,69 +4628,6 @@ extern void warn_deprecated_use (tree, tree);
+ extern void cache_integer_cst (tree);
+ extern const char *combined_fn_name (combined_fn);
+
+-/* Return the memory model from a host integer. */
+-static inline enum memmodel
+-memmodel_from_int (unsigned HOST_WIDE_INT val)
+-{
+- return (enum memmodel) (val & MEMMODEL_MASK);
+-}
+-
+-/* Return the base memory model from a host integer. */
+-static inline enum memmodel
+-memmodel_base (unsigned HOST_WIDE_INT val)
+-{
+- return (enum memmodel) (val & MEMMODEL_BASE_MASK);
+-}
+-
+-/* Return TRUE if the memory model is RELAXED. */
+-static inline bool
+-is_mm_relaxed (enum memmodel model)
+-{
+- return (model & MEMMODEL_BASE_MASK) == MEMMODEL_RELAXED;
+-}
+-
+-/* Return TRUE if the memory model is CONSUME. */
+-static inline bool
+-is_mm_consume (enum memmodel model)
+-{
+- return (model & MEMMODEL_BASE_MASK) == MEMMODEL_CONSUME;
+-}
+-
+-/* Return TRUE if the memory model is ACQUIRE. */
+-static inline bool
+-is_mm_acquire (enum memmodel model)
+-{
+- return (model & MEMMODEL_BASE_MASK) == MEMMODEL_ACQUIRE;
+-}
+-
+-/* Return TRUE if the memory model is RELEASE. */
+-static inline bool
+-is_mm_release (enum memmodel model)
+-{
+- return (model & MEMMODEL_BASE_MASK) == MEMMODEL_RELEASE;
+-}
+-
+-/* Return TRUE if the memory model is ACQ_REL. */
+-static inline bool
+-is_mm_acq_rel (enum memmodel model)
+-{
+- return (model & MEMMODEL_BASE_MASK) == MEMMODEL_ACQ_REL;
+-}
+-
+-/* Return TRUE if the memory model is SEQ_CST. */
+-static inline bool
+-is_mm_seq_cst (enum memmodel model)
+-{
+- return (model & MEMMODEL_BASE_MASK) == MEMMODEL_SEQ_CST;
+-}
+-
+-/* Return TRUE if the memory model is a SYNC variant. */
+-static inline bool
+-is_mm_sync (enum memmodel model)
+-{
+- return (model & MEMMODEL_SYNC);
+-}
+-
+ /* Compare and hash for any structure which begins with a canonical
+ pointer. Assumes all pointers are interchangeable, which is sort
+ of already assumed by gcc elsewhere IIRC. */
+--- a/src/gcc/tsan.c
++++ b/src/gcc/tsan.c
+@@ -25,6 +25,7 @@ along with GCC; see the file COPYING3. If not see
+ #include "backend.h"
+ #include "rtl.h"
+ #include "tree.h"
++#include "memmodel.h"
+ #include "gimple.h"
+ #include "tree-pass.h"
+ #include "ssa.h"
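[Note on the tree.h/tsan.c hunks above: they track an upstream refactoring in which the memmodel_from_int, memmodel_base and is_mm_* helpers are deleted from tree.h because they now live in a dedicated memmodel.h header — which is why tsan.c, as a consumer of memory-model constants, gains an explicit include. A minimal standalone sketch of the encoding these predicates rely on; the enum values and mask mirror GCC's memmodel.h, while the main harness is invented for illustration:

    #include <stdbool.h>
    #include <stdio.h>

    /* The low bits encode the base C++11 ordering; a separate bit marks
       the legacy __sync variants.  Values mirror GCC's memmodel.h.  */
    enum memmodel
    {
      MEMMODEL_RELAXED = 0,
      MEMMODEL_CONSUME = 1,
      MEMMODEL_ACQUIRE = 2,
      MEMMODEL_RELEASE = 3,
      MEMMODEL_ACQ_REL = 4,
      MEMMODEL_SEQ_CST = 5,
      MEMMODEL_SYNC = 8
    };
    #define MEMMODEL_BASE_MASK (MEMMODEL_SYNC - 1)

    static inline bool
    is_mm_seq_cst (enum memmodel model)
    {
      return (model & MEMMODEL_BASE_MASK) == MEMMODEL_SEQ_CST;
    }

    static inline bool
    is_mm_sync (enum memmodel model)
    {
      return (model & MEMMODEL_SYNC) != 0;
    }

    int
    main (void)
    {
      /* A __sync builtin maps to SEQ_CST with the SYNC bit set.  */
      enum memmodel m = (enum memmodel) (MEMMODEL_SEQ_CST | MEMMODEL_SYNC);
      printf ("seq_cst=%d sync=%d\n", is_mm_seq_cst (m), is_mm_sync (m));
      return 0;
    }
]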
--- a/src/gcc/varasm.c
+++ b/src/gcc/varasm.c
-@@ -6772,6 +6772,15 @@ default_use_anchors_for_symbol_p (const_rtx symbol)
+@@ -6776,6 +6776,16 @@ default_use_anchors_for_symbol_p (const_rtx symbol)
sections that should be marked as small in the section directive. */
if (targetm.in_small_data_p (decl))
return false;
+
+ /* Don't use section anchors for decls that won't fit inside a single
-+ anchor range to reduce the amount of instructions require to refer
++ anchor range to reduce the amount of instructions required to refer
+ to the entire declaration. */
-+ if (decl && DECL_SIZE (decl)
-+ && tree_to_shwi (DECL_SIZE (decl))
-+ >= (targetm.max_anchor_offset * BITS_PER_UNIT))
++ if (DECL_SIZE_UNIT (decl) == NULL_TREE
++ || !tree_fits_uhwi_p (DECL_SIZE_UNIT (decl))
++ || (tree_to_uhwi (DECL_SIZE_UNIT (decl))
++ >= (unsigned HOST_WIDE_INT) targetm.max_anchor_offset))
+ return false;
+
}
@@ -128089,6 +139156,20 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
# The floating-point conversion routines that involve a single-word integer.
# XX stands for the integer mode.
+--- a/src/libgcc/config.host
++++ b/src/libgcc/config.host
+@@ -333,6 +333,11 @@ aarch64*-*-elf | aarch64*-*-rtems*)
+ tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
+ tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
+ ;;
++aarch64*-*-freebsd*)
++ extra_parts="$extra_parts crtfastmath.o"
++ tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
++ tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
++ ;;
+ aarch64*-*-linux*)
+ extra_parts="$extra_parts crtfastmath.o"
+ md_unwind_header=aarch64/linux-unwind.h
--- a/src/libgcc/config/arm/bpabi-v6m.S
+++ b/src/libgcc/config/arm/bpabi-v6m.S
@@ -1,4 +1,5 @@
@@ -128098,6 +139179,251 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
Copyright (C) 2006-2016 Free Software Foundation, Inc.
Contributed by CodeSourcery.
+--- /dev/null
++++ b/src/libgcc/config/arm/cmse.c
+@@ -0,0 +1,108 @@
++/* ARMv8-M Security Extensions routines.
++ Copyright (C) 2015-2016 Free Software Foundation, Inc.
++ Contributed by ARM Ltd.
++
++ This file is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by the
++ Free Software Foundation; either version 3, or (at your option) any
++ later version.
++
++ This file is distributed in the hope that it will be useful, but
++ WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ General Public License for more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++
++#if __ARM_FEATURE_CMSE & 1
++
++#include <arm_cmse.h>
++
++/* ARM intrinsic function to perform a permission check on a given
++ address range. See ACLE changes for ARMv8-M. */
++
++void *
++cmse_check_address_range (void *p, size_t size, int flags)
++{
++ cmse_address_info_t permb, perme;
++ char *pb = (char *) p, *pe;
++
++ /* Check if the range wraps around. */
++ if (UINTPTR_MAX - (uintptr_t) p < size)
++ return NULL;
++
++ /* Check if an unknown flag is present. */
++ int known = CMSE_MPU_UNPRIV | CMSE_MPU_READWRITE | CMSE_MPU_READ;
++ int known_secure_level = CMSE_MPU_UNPRIV;
++#if __ARM_FEATURE_CMSE & 2
++ known |= CMSE_AU_NONSECURE | CMSE_MPU_NONSECURE;
++ known_secure_level |= CMSE_MPU_NONSECURE;
++#endif
++ if (flags & (~known))
++ return NULL;
++
++ /* Execute the right variant of the TT instructions. */
++ pe = pb + size - 1;
++ const int singleCheck = (((uintptr_t) pb ^ (uintptr_t) pe) < 32);
++ switch (flags & known_secure_level)
++ {
++ case 0:
++ permb = cmse_TT (pb);
++ perme = singleCheck ? permb : cmse_TT (pe);
++ break;
++ case CMSE_MPU_UNPRIV:
++ permb = cmse_TTT (pb);
++ perme = singleCheck ? permb : cmse_TTT (pe);
++ break;
++#if __ARM_FEATURE_CMSE & 2
++ case CMSE_MPU_NONSECURE:
++ permb = cmse_TTA (pb);
++ perme = singleCheck ? permb : cmse_TTA (pe);
++ break;
++ case CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE:
++ permb = cmse_TTAT (pb);
++ perme = singleCheck ? permb : cmse_TTAT (pe);
++ break;
++#endif
++ default:
++ /* Invalid flag, eg. CMSE_MPU_NONSECURE specified but
++ __ARM_FEATURE_CMSE & 2 == 0. */
++ return NULL;
++ }
++
++ /* Check that the range does not cross MPU, SAU, or IDAU boundaries. */
++ if (permb.value != perme.value)
++ return NULL;
++
++ /* Check the permissions on the range. */
++ switch (flags & (~known_secure_level))
++ {
++#if __ARM_FEATURE_CMSE & 2
++ case CMSE_MPU_READ | CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
++ case CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
++ return permb.flags.nonsecure_readwrite_ok ? p : NULL;
++ case CMSE_MPU_READ | CMSE_AU_NONSECURE:
++ return permb.flags.nonsecure_read_ok ? p : NULL;
++ case CMSE_AU_NONSECURE:
++ return permb.flags.secure ? NULL : p;
++#endif
++ case CMSE_MPU_READ | CMSE_MPU_READWRITE:
++ case CMSE_MPU_READWRITE:
++ return permb.flags.readwrite_ok ? p : NULL;
++ case CMSE_MPU_READ:
++ return permb.flags.read_ok ? p : NULL;
++ default:
++ return NULL;
++ }
++}
++
++
++#endif /* __ARM_FEATURE_CMSE & 1. */
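[Note: the new cmse.c above implements cmse_check_address_range by rejecting ranges that wrap, running the TT/TTT/TTA/TTAT instruction variant selected by the flags on the first and last byte, refusing ranges whose two results differ (an MPU, SAU or IDAU boundary crossing), and only then checking permissions. A hedged usage sketch for the secure side of an ARMv8-M firmware, compiled with -mcmse; the function and buffer names are illustrative, while arm_cmse.h, the CMSE_* flags and the cmse_nonsecure_entry attribute belong to the extension itself:

    #include <arm_cmse.h>
    #include <stddef.h>

    /* Secure-world entry point: only read the non-secure caller's
       buffer once the whole range is confirmed non-secure-readable.  */
    int __attribute__ ((cmse_nonsecure_entry))
    secure_checksum (const unsigned char *buf, size_t len)
    {
      if (cmse_check_address_range ((void *) buf, len,
                                    CMSE_MPU_READ | CMSE_AU_NONSECURE)
          == NULL)
        return -1;  /* Wrapping range, boundary crossing, or no access.  */

      int sum = 0;
      for (size_t i = 0; i < len; i++)
        sum += buf[i];
      return sum;
    }
]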
+--- /dev/null
++++ b/src/libgcc/config/arm/cmse_nonsecure_call.S
+@@ -0,0 +1,131 @@
++/* CMSE wrapper function used to save, clear and restore callee saved registers
++ for cmse_nonsecure_call's.
++
++ Copyright (C) 2016 Free Software Foundation, Inc.
++ Contributed by ARM Ltd.
++
++ This file is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by the
++ Free Software Foundation; either version 3, or (at your option) any
++ later version.
++
++ This file is distributed in the hope that it will be useful, but
++ WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ General Public License for more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++.syntax unified
++.thumb
++.global __gnu_cmse_nonsecure_call
++__gnu_cmse_nonsecure_call:
++#if defined(__ARM_ARCH_8M_MAIN__)
++push {r5-r11,lr}
++mov r7, r4
++mov r8, r4
++mov r9, r4
++mov r10, r4
++mov r11, r4
++mov ip, r4
++
++/* Save and clear callee-saved registers only if we are dealing with hard float
++ ABI. The unused caller-saved registers have already been cleared by GCC
++ generated code. */
++#ifdef __ARM_PCS_VFP
++vpush.f64 {d8-d15}
++mov r5, #0
++vmov d8, r5, r5
++#if __ARM_FP & 0x04
++vmov s18, s19, r5, r5
++vmov s20, s21, r5, r5
++vmov s22, s23, r5, r5
++vmov s24, s25, r5, r5
++vmov s26, s27, r5, r5
++vmov s28, s29, r5, r5
++vmov s30, s31, r5, r5
++#elif __ARM_FP & 0x08
++vmov.f64 d9, d8
++vmov.f64 d10, d8
++vmov.f64 d11, d8
++vmov.f64 d12, d8
++vmov.f64 d13, d8
++vmov.f64 d14, d8
++vmov.f64 d15, d8
++#else
++#error "Half precision implementation not supported."
++#endif
++/* Clear the cumulative exception-status bits (0-4,7) and the
++ condition code bits (28-31) of the FPSCR. */
++vmrs r5, fpscr
++movw r6, #65376
++movt r6, #4095
++ands r5, r6
++vmsr fpscr, r5
++
++/* We are not dealing with hard float ABI, so we can safely use the vlstm and
++ vlldm instructions without needing to preserve the registers used for
++ argument passing. */
++#else
++sub sp, sp, #0x88 /* Reserve stack space to save all floating point
++ registers, including FPSCR. */
++vlstm sp /* Lazy store and clearance of d0-d16 and FPSCR. */
++#endif /* __ARM_PCS_VFP */
++
++/* Make sure to clear the 'GE' bits of the APSR register if 32-bit SIMD
++ instructions are available. */
++#if defined(__ARM_FEATURE_SIMD32)
++msr APSR_nzcvqg, r4
++#else
++msr APSR_nzcvq, r4
++#endif
++
++mov r5, r4
++mov r6, r4
++blxns r4
++
++#ifdef __ARM_PCS_VFP
++vpop.f64 {d8-d15}
++#else
++vlldm sp /* Lazy restore of d0-d16 and FPSCR. */
++add sp, sp, #0x88 /* Free space used to save floating point registers. */
++#endif /* __ARM_PCS_VFP */
++
++pop {r5-r11, pc}
++
++#elif defined (__ARM_ARCH_8M_BASE__)
++push {r5-r7, lr}
++mov r5, r8
++mov r6, r9
++mov r7, r10
++push {r5-r7}
++mov r5, r11
++push {r5}
++mov r5, r4
++mov r6, r4
++mov r7, r4
++mov r8, r4
++mov r9, r4
++mov r10, r4
++mov r11, r4
++mov ip, r4
++msr APSR_nzcvq, r4
++blxns r4
++pop {r5}
++mov r11, r5
++pop {r5-r7}
++mov r10, r7
++mov r9, r6
++mov r8, r5
++pop {r5-r7, pc}
++
++#else
++#error "This should only be used for armv8-m base- and mainline."
++#endif
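[Note: __gnu_cmse_nonsecure_call above is never called by hand. When a call goes through a function pointer marked with the cmse_nonsecure_call attribute, GCC under -mcmse routes it through this wrapper so that the callee-saved core registers — and, for the hard-float ABI, the FP registers and FPSCR — are saved, zeroed before BLXNS switches to the non-secure state, and restored afterwards. A hedged C-level sketch of the calling side; the typedef and parameter names are illustrative:

    /* Function-pointer type whose calls transition to non-secure state;
       the attribute is only meaningful on function pointers.  */
    typedef void __attribute__ ((cmse_nonsecure_call)) ns_callback_t (int);

    void
    notify_nonsecure (ns_callback_t *cb, int event)
    {
      if (cb)
        cb (event);  /* Emitted as a __gnu_cmse_nonsecure_call transition.  */
    }
]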
--- a/src/libgcc/config/arm/ieee754-df.S
+++ b/src/libgcc/config/arm/ieee754-df.S
@@ -160,8 +160,8 @@ ARM_FUNC_ALIAS aeabi_dadd adddf3
@@ -128338,6 +139664,26 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
UNWIND_WRAPPER _Unwind_RaiseException 1
UNWIND_WRAPPER _Unwind_Resume 1
+--- a/src/libgcc/config/arm/t-arm
++++ b/src/libgcc/config/arm/t-arm
+@@ -1,3 +1,17 @@
+ LIB1ASMSRC = arm/lib1funcs.S
+ LIB1ASMFUNCS = _thumb1_case_sqi _thumb1_case_uqi _thumb1_case_shi \
+ _thumb1_case_uhi _thumb1_case_si
++
++HAVE_CMSE:=$(findstring __ARM_FEATURE_CMSE,$(shell $(gcc_compile_bare) -dM -E - </dev/null))
++ifneq ($(shell $(gcc_compile_bare) -E -mcmse - </dev/null 2>/dev/null),)
++CMSE_OPTS:=-mcmse
++endif
++
++ifdef HAVE_CMSE
++libgcc-objects += cmse.o cmse_nonsecure_call.o
++
++cmse.o: $(srcdir)/config/arm/cmse.c
++ $(gcc_compile) -c $(CMSE_OPTS) $<
++cmse_nonsecure_call.o: $(srcdir)/config/arm/cmse_nonsecure_call.S
++ $(gcc_compile) -c $<
++endif
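[Note: the t-arm fragment gates the new objects on the just-built compiler. HAVE_CMSE is non-empty when __ARM_FEATURE_CMSE shows up in the predefined-macro dump, and CMSE_OPTS adds -mcmse only when the driver actually accepts that option, so cmse.o is built with the intrinsics enabled while cmse_nonsecure_call.o, being pure assembly, is not. In C terms the probe keys on a predefine like the following — a sketch whose feature-bit reading matches the #if tests in cmse.c above:

    /* Bit 0 of __ARM_FEATURE_CMSE: CMSE available (TT instruction);
       bit 1: security extension proper (TTA/TTAT also available).  */
    #if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE & 1)
    #  define HAVE_CMSE_INTRINSICS 1
    #else
    #  define HAVE_CMSE_INTRINSICS 0
    #endif
]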
--- a/src/libgcc/config/arm/t-softfp
+++ b/src/libgcc/config/arm/t-softfp
@@ -1,2 +1,2 @@
@@ -128444,7 +139790,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
fi
--- a/src/libstdc++-v3/configure
+++ b/src/libstdc++-v3/configure
-@@ -79456,8 +79456,7 @@ $as_echo "$ac_cv_x86_rdrand" >&6; }
+@@ -79518,8 +79518,7 @@ $as_echo "$ac_cv_x86_rdrand" >&6; }
# This depends on GLIBCXX_ENABLE_SYMVERS and GLIBCXX_IS_NATIVE.
@@ -128454,7 +139800,7 @@ LANG=C git diff --no-renames 70232cbbcab57eecc73626f3ea0e13bdfa00202d..bc32472ee
setrlimit_have_headers=yes
for ac_header in unistd.h sys/time.h sys/resource.h
-@@ -79686,6 +79685,7 @@ $as_echo "#define _GLIBCXX_RES_LIMITS 1" >>confdefs.h
+@@ -79748,6 +79747,7 @@ $as_echo "#define _GLIBCXX_RES_LIMITS 1" >>confdefs.h
$as_echo "$ac_res_limits" >&6; }
diff --git a/debian/patches/linaro-issue2575.diff b/debian/patches/linaro-issue2575.diff
deleted file mode 100644
index 97bf780..0000000
--- a/debian/patches/linaro-issue2575.diff
+++ /dev/null
@@ -1,16 +0,0 @@
-# DP: Fix ICE in tree_to_shwi, Linaro issue #2575.
-
---- a/src/gcc/varasm.c
-+++ b/src/gcc/varasm.c
-@@ -6777,8 +6777,9 @@
- anchor range to reduce the amount of instructions require to refer
- to the entire declaration. */
- if (decl && DECL_SIZE (decl)
-- && tree_to_shwi (DECL_SIZE (decl))
-- >= (targetm.max_anchor_offset * BITS_PER_UNIT))
-+ && (!tree_fits_shwi_p (DECL_SIZE (decl))
-+ || tree_to_shwi (DECL_SIZE (decl))
-+ >= (targetm.max_anchor_offset * BITS_PER_UNIT)))
- return false;
-
- }
diff --git a/debian/rules.patch b/debian/rules.patch
index 92c3926..62cb094 100644
--- a/debian/rules.patch
+++ b/debian/rules.patch
@@ -15,11 +15,9 @@ series_file ?= $(patchdir)/series
debian_patches = \
svn-updates \
libiberty-updates \
- $(if $(with_linaro_branch),gcc-linaro-r244161-revert) \
- $(if $(with_linaro_branch),gcc-linaro-r243646-revert) \
+ $(if $(with_linaro_branch),gcc-linaro-r244242-revert) \
$(if $(with_linaro_branch),gcc-linaro) \
$(if $(with_linaro_branch),gcc-linaro-no-macros) \
- $(if $(with_linaro_branch),linaro-issue2575) \
# svn-updates \
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/gcc-6.git