[gcc-6] 195/401: * Update the Linaro support to the 6-2016.08 snapshot.

Ximin Luo infinity0@debian.org
Wed Apr 5 15:49:18 UTC 2017


This is an automated email from the git hooks/post-receive script.

infinity0 pushed a commit to branch pu/reproducible_builds
in repository gcc-6.

commit 7688e557f84af11294e3ce6a84750049d7e1c5b2
Author: doko <doko@6ca36cf4-e1d1-0310-8c6f-e303bb2178ca>
Date:   Tue Aug 30 17:43:43 2016 +0000

      * Update the Linaro support to the 6-2016.08 snapshot.
    
    
    git-svn-id: svn://anonscm.debian.org/gcccvs/branches/sid/gcc-6@8959 6ca36cf4-e1d1-0310-8c6f-e303bb2178ca
---
 debian/changelog                         |    1 +
 debian/patches/gcc-linaro-doc.diff       |   58 +-
 debian/patches/gcc-linaro-no-macros.diff |    2 +-
 debian/patches/gcc-linaro.diff           | 4056 ++++++++++++++++++++++++++++--
 debian/patches/vulcan-costs.diff         |  259 --
 debian/patches/vulcan-cpu-doc.diff       |   27 -
 debian/patches/vulcan-cpu.diff           |   39 -
 debian/rules.patch                       |    3 -
 8 files changed, 3911 insertions(+), 534 deletions(-)

diff --git a/debian/changelog b/debian/changelog
index f09147a..d063b15 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -9,6 +9,7 @@ gcc-6 (6.2.0-2) UNRELEASED; urgency=medium
   * Fix install location of D header files for cross builds (YunQiang Su).
     Closes: #835847.
   * Fix PR c++/77379, taken from the trunk.
+  * Update the Linaro support to the 6-2016.08 snapshot.
 
  -- Matthias Klose <doko@debian.org>  Wed, 24 Aug 2016 08:07:34 +0200
 
diff --git a/debian/patches/gcc-linaro-doc.diff b/debian/patches/gcc-linaro-doc.diff
index 2909dd6..dd2b39c 100644
--- a/debian/patches/gcc-linaro-doc.diff
+++ b/debian/patches/gcc-linaro-doc.diff
@@ -1,4 +1,4 @@
-# DP: Changes for the Linaro 6-2016.07 release (documentation).
+# DP: Changes for the Linaro 6-2016.08 release (documentation).
 
 --- a/src/gcc/doc/cpp.texi
 +++ b/src/gcc/doc/cpp.texi
@@ -13,7 +13,16 @@
  like this:
 --- a/src/gcc/doc/invoke.texi
 +++ b/src/gcc/doc/invoke.texi
-@@ -9478,6 +9478,11 @@ Size of minimal partition for WHOPR (in estimated instructions).
+@@ -573,6 +573,8 @@ Objective-C and Objective-C++ Dialects}.
+ -mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
+ -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
+ -mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
++-mlow-precision-sqrt -mno-low-precision-sqrt@gol
++-mlow-precision-div -mno-low-precision-div @gol
+ -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
+ 
+ @emph{Adapteva Epiphany Options}
+@@ -9478,6 +9480,11 @@ Size of minimal partition for WHOPR (in estimated instructions).
  This prevents expenses of splitting very small programs into too many
  partitions.
  
@@ -25,7 +34,7 @@
  @item cxx-max-namespaces-for-diagnostic-help
  The maximum number of namespaces to consult for suggestions when C++
  name lookup fails for an identifier.  The default is 1000.
-@@ -12828,9 +12833,9 @@ These options are defined for AArch64 implementations:
+@@ -12828,9 +12835,9 @@ These options are defined for AArch64 implementations:
  @item -mabi=@var{name}
  @opindex mabi
  Generate code for the specified data model.  Permissible values
@@ -38,7 +47,7 @@
  
  The default depends on the specific target configuration.  Note that
  the LP64 and ILP32 ABIs are not link-compatible; you must compile your
-@@ -12855,25 +12860,24 @@ Generate little-endian code.  This is the default when GCC is configured for an
+@@ -12855,25 +12862,24 @@ Generate little-endian code.  This is the default when GCC is configured for an
  @item -mcmodel=tiny
  @opindex mcmodel=tiny
  Generate code for the tiny code model.  The program and its statically defined
@@ -71,7 +80,7 @@
  
  @item -momit-leaf-frame-pointer
  @itemx -mno-omit-leaf-frame-pointer
-@@ -12895,7 +12899,7 @@ of TLS variables.
+@@ -12895,7 +12901,7 @@ of TLS variables.
  @item -mtls-size=@var{size}
  @opindex mtls-size
  Specify bit size of immediate TLS offsets.  Valid values are 12, 24, 32, 48.
@@ -80,7 +89,7 @@
  
  @item -mfix-cortex-a53-835769
  @itemx -mno-fix-cortex-a53-835769
-@@ -12915,12 +12919,13 @@ corresponding flag to the linker.
+@@ -12915,12 +12921,34 @@ corresponding flag to the linker.
  
  @item -mlow-precision-recip-sqrt
  @item -mno-low-precision-recip-sqrt
@@ -92,21 +101,42 @@
 -approximation, which in turn depends on the target processor.
 +@opindex mlow-precision-recip-sqrt
 +@opindex mno-low-precision-recip-sqrt
-+Enable or disable reciprocal square root approximation.
++Enable or disable the reciprocal square root approximation.
 +This option only has an effect if @option{-ffast-math} or
 +@option{-funsafe-math-optimizations} is used as well.  Enabling this reduces
 +precision of reciprocal square root results to about 16 bits for
 +single precision and to 32 bits for double precision.
++
++@item -mlow-precision-sqrt
++@item -mno-low-precision-sqrt
++@opindex -mlow-precision-sqrt
++@opindex -mno-low-precision-sqrt
++Enable or disable the square root approximation.
++This option only has an effect if @option{-ffast-math} or
++@option{-funsafe-math-optimizations} is used as well.  Enabling this reduces
++precision of square root results to about 16 bits for
++single precision and to 32 bits for double precision.
++If enabled, it implies @option{-mlow-precision-recip-sqrt}.
++
++@item -mlow-precision-div
++@item -mno-low-precision-div
++@opindex -mlow-precision-div
++@opindex -mno-low-precision-div
++Enable or disable the division approximation.
++This option only has an effect if @option{-ffast-math} or
++@option{-funsafe-math-optimizations} is used as well.  Enabling this reduces
++precision of division results to about 16 bits for
++single precision and to 32 bits for double precision.
  
  @item -march=@var{name}
  @opindex march
-@@ -12957,17 +12962,15 @@ Specify the name of the target processor for which GCC should tune the
+@@ -12957,17 +12985,15 @@ Specify the name of the target processor for which GCC should tune the
  performance of the code.  Permissible values for this option are:
  @samp{generic}, @samp{cortex-a35}, @samp{cortex-a53}, @samp{cortex-a57},
  @samp{cortex-a72}, @samp{exynos-m1}, @samp{qdf24xx}, @samp{thunderx},
 -@samp{xgene1}.
-+@samp{xgene1}, @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53},
-+@samp{native}.
++@samp{xgene1}, @samp{vulcan}, @samp{cortex-a57.cortex-a53},
++@samp{cortex-a72.cortex-a53}, @samp{native}.
  
 -Additionally, this option can specify that GCC should tune the performance
 -of the code for a big.LITTLE system.  Permissible values for this
@@ -124,7 +154,7 @@
  
  Where none of @option{-mtune=}, @option{-mcpu=} or @option{-march=}
  are specified, the code is tuned to perform well across a range
-@@ -12987,12 +12990,6 @@ documented in the sub-section on
+@@ -12987,12 +13013,6 @@ documented in the sub-section on
  Feature Modifiers}.  Where conflicting feature modifiers are
  specified, the right-most feature is used.
  
@@ -137,7 +167,7 @@
  GCC uses @var{name} to determine what kind of instructions it can emit when
  generating assembly code (as if by @option{-march}) and to determine
  the target processor for which to tune for performance (as if
-@@ -13010,11 +13007,11 @@ across releases.
+@@ -13010,11 +13030,11 @@ across releases.
  This option is only intended to be useful when developing GCC.
  
  @item -mpc-relative-literal-loads
@@ -154,7 +184,7 @@
  
  @end table
  
-@@ -13045,9 +13042,9 @@ Enable Large System Extension instructions.  This is on by default for
+@@ -13045,9 +13065,9 @@ Enable Large System Extension instructions.  This is on by default for
  
  @end table
  
@@ -167,7 +197,7 @@
  
  @node Adapteva Epiphany Options
  @subsection Adapteva Epiphany Options
-@@ -18082,7 +18079,7 @@ IEEE 754 floating-point data.
+@@ -18082,7 +18102,7 @@ IEEE 754 floating-point data.
  
  The @option{-mnan=legacy} option selects the legacy encoding.  In this
  case quiet NaNs (qNaNs) are denoted by the first bit of their trailing
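
The three -mlow-precision-* options documented in the invoke.texi hunks
above share one gate: they only take effect together with -ffast-math or
-funsafe-math-optimizations.  A minimal sketch of the kind of code they
target (the file name and compile line are illustrative, not taken from
this patch):

    /* approx.c -- illustrative only.  With the flags below, GCC 6
       (Linaro) may expand the square root, reciprocal square root and
       division via frsqrte/frsqrts and frecpe/frecps Newton series
       instead of fsqrt and fdiv:
           gcc -O2 -ffast-math -mlow-precision-sqrt \
               -mlow-precision-div -c approx.c  */
    #include <math.h>

    float  rsqrtf_demo (float x)          { return 1.0f / sqrtf (x); }
    double div_demo (double a, double b)  { return a / b; }
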
diff --git a/debian/patches/gcc-linaro-no-macros.diff b/debian/patches/gcc-linaro-no-macros.diff
index df3d913..9da5f40 100644
--- a/debian/patches/gcc-linaro-no-macros.diff
+++ b/debian/patches/gcc-linaro-no-macros.diff
@@ -89,7 +89,7 @@ Index: b/src/gcc/LINARO-VERSION
 --- a/src/gcc/LINARO-VERSION
 +++ /dev/null
 @@ -1 +0,0 @@
--Snapshot 6.1-2016.07
+-6.1-2016.08~dev
 Index: b/src/gcc/configure.ac
 ===================================================================
 --- a/src/gcc/configure.ac
diff --git a/debian/patches/gcc-linaro.diff b/debian/patches/gcc-linaro.diff
index 2ad91f2..3494a03 100644
--- a/debian/patches/gcc-linaro.diff
+++ b/debian/patches/gcc-linaro.diff
@@ -1,8 +1,8 @@
-# DP: Changes for the Linaro 6-2016.07 release.
+# DP: Changes for the Linaro 6-2016.08 release.
 
 MSG=$(git log origin/linaro/gcc-6-branch --format=format:"%s" -n 1 --grep "Merge branches"); SVN=${MSG##* }; git log origin/gcc-6-branch --format=format:"%H" -n 1 --grep "gcc-6-branch@${SVN%.}"
 
-LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b8175015c74b948ff1e32197 \
+LANG=C git diff ac6fe0ee825550e1dfefffd649d49133011d5eb8..91b11ff9859dee06a84ac410a5588dd1faf3462a \
  | egrep -v '^(diff|index) ' \
  | filterdiff --strip=1 --addoldprefix=a/src/  --addnewprefix=b/src/ \
  | sed 's,a/src//dev/null,/dev/null,'
@@ -10,7 +10,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
 --- /dev/null
 +++ b/src/gcc/LINARO-VERSION
 @@ -0,0 +1 @@
-+Snapshot 6.1-2016.07
++6.1-2016.08~dev
 --- a/src/gcc/Makefile.in
 +++ b/src/gcc/Makefile.in
 @@ -832,10 +832,12 @@ BASEVER     := $(srcdir)/BASE-VER  # 4.x.y
@@ -114,6 +114,143 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  		fi
  		;;
  
+--- a/src/gcc/config/aarch64/aarch64-builtins.c
++++ b/src/gcc/config/aarch64/aarch64-builtins.c
+@@ -173,6 +173,10 @@ aarch64_types_shift_to_unsigned_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+   = { qualifier_unsigned, qualifier_none, qualifier_immediate };
+ #define TYPES_SHIFTIMM_USS (aarch64_types_shift_to_unsigned_qualifiers)
+ static enum aarch64_type_qualifiers
++aarch64_types_fcvt_from_unsigned_qualifiers[SIMD_MAX_BUILTIN_ARGS]
++  = { qualifier_none, qualifier_unsigned, qualifier_immediate };
++#define TYPES_FCVTIMM_SUS (aarch64_types_fcvt_from_unsigned_qualifiers)
++static enum aarch64_type_qualifiers
+ aarch64_types_unsigned_shift_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+   = { qualifier_unsigned, qualifier_unsigned, qualifier_immediate };
+ #define TYPES_USHIFTIMM (aarch64_types_unsigned_shift_qualifiers)
+--- a/src/gcc/config/aarch64/aarch64-cores.def
++++ b/src/gcc/config/aarch64/aarch64-cores.def
+@@ -49,6 +49,10 @@ AARCH64_CORE("qdf24xx",     qdf24xx,   cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AA
+ AARCH64_CORE("thunderx",    thunderx,  thunderx,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx,  "0x43", "0x0a1")
+ AARCH64_CORE("xgene1",      xgene1,    xgene1,    8A,  AARCH64_FL_FOR_ARCH8, xgene1, "0x50", "0x000")
+ 
++/* V8.1 Architecture Processors.  */
++
++AARCH64_CORE("vulcan",  vulcan, cortexa57, 8_1A,  AARCH64_FL_FOR_ARCH8_1 | AARCH64_FL_CRYPTO, vulcan, "0x42", "0x516")
++
+ /* V8 big.LITTLE implementations.  */
+ 
+ AARCH64_CORE("cortex-a57.cortex-a53",  cortexa57cortexa53, cortexa53, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, "0x41", "0xd07.0xd03")
+--- a/src/gcc/config/aarch64/aarch64-cost-tables.h
++++ b/src/gcc/config/aarch64/aarch64-cost-tables.h
+@@ -127,6 +127,108 @@ const struct cpu_cost_table thunderx_extra_costs =
+   }
+ };
+ 
++const struct cpu_cost_table vulcan_extra_costs =
++{
++  /* ALU */
++  {
++    0,			/* Arith.  */
++    0,			/* Logical.  */
++    0,			/* Shift.  */
++    0,			/* Shift_reg.  */
++    COSTS_N_INSNS (1),	/* Arith_shift.  */
++    COSTS_N_INSNS (1),	/* Arith_shift_reg.  */
++    COSTS_N_INSNS (1),	/* Log_shift.  */
++    COSTS_N_INSNS (1),	/* Log_shift_reg.  */
++    0,			/* Extend.  */
++    COSTS_N_INSNS (1),	/* Extend_arith.  */
++    0,			/* Bfi.  */
++    0,			/* Bfx.  */
++    COSTS_N_INSNS (3),	/* Clz.  */
++    0,			/* Rev.  */
++    0,			/* Non_exec.  */
++    true		/* Non_exec_costs_exec.  */
++  },
++  {
++    /* MULT SImode */
++    {
++      COSTS_N_INSNS (4),	/* Simple.  */
++      COSTS_N_INSNS (4),	/* Flag_setting.  */
++      COSTS_N_INSNS (4),	/* Extend.  */
++      COSTS_N_INSNS (5),	/* Add.  */
++      COSTS_N_INSNS (5),	/* Extend_add.  */
++      COSTS_N_INSNS (18)	/* Idiv.  */
++    },
++    /* MULT DImode */
++    {
++      COSTS_N_INSNS (4),       /* Simple.  */
++      0,                       /* Flag_setting.  */
++      COSTS_N_INSNS (4),       /* Extend.  */
++      COSTS_N_INSNS (5),       /* Add.  */
++      COSTS_N_INSNS (5),       /* Extend_add.  */
++      COSTS_N_INSNS (26)       /* Idiv.  */
++    }
++  },
++  /* LD/ST */
++  {
++    COSTS_N_INSNS (4),	/* Load.  */
++    COSTS_N_INSNS (4),	/* Load_sign_extend.  */
++    COSTS_N_INSNS (5),	/* Ldrd.  */
++    COSTS_N_INSNS (4),	/* Ldm_1st.  */
++    1,			/* Ldm_regs_per_insn_1st.  */
++    1,			/* Ldm_regs_per_insn_subsequent.  */
++    COSTS_N_INSNS (4),	/* Loadf.  */
++    COSTS_N_INSNS (4),	/* Loadd.  */
++    COSTS_N_INSNS (4),	/* Load_unaligned.  */
++    0,			/* Store.  */
++    0,			/* Strd.  */
++    0,			/* Stm_1st.  */
++    1,			/* Stm_regs_per_insn_1st.  */
++    1,			/* Stm_regs_per_insn_subsequent.  */
++    0,			/* Storef.  */
++    0,			/* Stored.  */
++    0,			/* Store_unaligned.  */
++    COSTS_N_INSNS (1),	/* Loadv.  */
++    COSTS_N_INSNS (1)	/* Storev.  */
++  },
++  {
++    /* FP SFmode */
++    {
++      COSTS_N_INSNS (4),	/* Div.  */
++      COSTS_N_INSNS (1),	/* Mult.  */
++      COSTS_N_INSNS (1),	/* Mult_addsub. */
++      COSTS_N_INSNS (1),	/* Fma.  */
++      COSTS_N_INSNS (1),	/* Addsub.  */
++      COSTS_N_INSNS (1),	/* Fpconst. */
++      COSTS_N_INSNS (1),	/* Neg.  */
++      COSTS_N_INSNS (1),	/* Compare.  */
++      COSTS_N_INSNS (2),	/* Widen.  */
++      COSTS_N_INSNS (2),	/* Narrow.  */
++      COSTS_N_INSNS (2),	/* Toint.  */
++      COSTS_N_INSNS (2),	/* Fromint.  */
++      COSTS_N_INSNS (2) 	/* Roundint.  */
++    },
++    /* FP DFmode */
++    {
++      COSTS_N_INSNS (6),	/* Div.  */
++      COSTS_N_INSNS (1),	/* Mult.  */
++      COSTS_N_INSNS (1),	/* Mult_addsub.  */
++      COSTS_N_INSNS (1),	/* Fma.  */
++      COSTS_N_INSNS (1),	/* Addsub.  */
++      COSTS_N_INSNS (1),	/* Fpconst.  */
++      COSTS_N_INSNS (1),	/* Neg.  */
++      COSTS_N_INSNS (1),	/* Compare.  */
++      COSTS_N_INSNS (2),	/* Widen.  */
++      COSTS_N_INSNS (2),	/* Narrow.  */
++      COSTS_N_INSNS (2),	/* Toint.  */
++      COSTS_N_INSNS (2),	/* Fromint.  */
++      COSTS_N_INSNS (2) 	/* Roundint.  */
++    }
++  },
++  /* Vector */
++  {
++    COSTS_N_INSNS (1)	/* Alu.  */
++  }
++};
+ 
+ 
+ #endif
 --- a/src/gcc/config/aarch64/aarch64-elf.h
 +++ b/src/gcc/config/aarch64/aarch64-elf.h
 @@ -25,15 +25,6 @@
@@ -145,7 +282,46 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  CC_MODE (CC_C);     /* Only C bit of condition flags is valid.  */
 --- a/src/gcc/config/aarch64/aarch64-protos.h
 +++ b/src/gcc/config/aarch64/aarch64-protos.h
-@@ -290,6 +290,7 @@ bool aarch64_constant_address_p (rtx);
+@@ -178,6 +178,25 @@ struct cpu_branch_cost
+   const int unpredictable;  /* Unpredictable branch or optimizing for speed.  */
+ };
+ 
++/* Control approximate alternatives to certain FP operators.  */
++#define AARCH64_APPROX_MODE(MODE) \
++  ((MIN_MODE_FLOAT <= (MODE) && (MODE) <= MAX_MODE_FLOAT) \
++   ? (1 << ((MODE) - MIN_MODE_FLOAT)) \
++   : (MIN_MODE_VECTOR_FLOAT <= (MODE) && (MODE) <= MAX_MODE_VECTOR_FLOAT) \
++     ? (1 << ((MODE) - MIN_MODE_VECTOR_FLOAT \
++	      + MAX_MODE_FLOAT - MIN_MODE_FLOAT + 1)) \
++     : (0))
++#define AARCH64_APPROX_NONE (0)
++#define AARCH64_APPROX_ALL (-1)
++
++/* Allowed modes for approximations.  */
++struct cpu_approx_modes
++{
++  const unsigned int division;		/* Division.  */
++  const unsigned int sqrt;		/* Square root.  */
++  const unsigned int recip_sqrt;	/* Reciprocal square root.  */
++};
++
+ struct tune_params
+ {
+   const struct cpu_cost_table *insn_extra_cost;
+@@ -185,6 +204,7 @@ struct tune_params
+   const struct cpu_regmove_cost *regmove_cost;
+   const struct cpu_vector_cost *vec_costs;
+   const struct cpu_branch_cost *branch_costs;
++  const struct cpu_approx_modes *approx_modes;
+   int memmov_cost;
+   int issue_rate;
+   unsigned int fusible_ops;
+@@ -287,9 +307,12 @@ bool aarch64_cannot_change_mode_class (machine_mode,
+ 				       enum reg_class);
+ bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
+ bool aarch64_constant_address_p (rtx);
++bool aarch64_emit_approx_div (rtx, rtx, rtx);
++bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
  bool aarch64_expand_movmem (rtx *);
  bool aarch64_float_const_zero_rtx_p (rtx);
  bool aarch64_function_arg_regno_p (unsigned);
@@ -153,7 +329,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  bool aarch64_gen_movmemqi (rtx *);
  bool aarch64_gimple_fold_builtin (gimple_stmt_iterator *);
  bool aarch64_is_extend_from_extract (machine_mode, rtx, rtx);
-@@ -335,11 +336,9 @@ machine_mode aarch64_hard_regno_caller_save_mode (unsigned, unsigned,
+@@ -335,11 +358,9 @@ machine_mode aarch64_hard_regno_caller_save_mode (unsigned, unsigned,
  						       machine_mode);
  int aarch64_hard_regno_mode_ok (unsigned, machine_mode);
  int aarch64_hard_regno_nregs (unsigned, machine_mode);
@@ -165,9 +341,41 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  rtx aarch64_mask_from_zextract_ops (rtx, rtx);
  const char *aarch64_output_move_struct (rtx *operands);
  rtx aarch64_return_addr (int, rtx);
+@@ -369,7 +390,6 @@ void aarch64_register_pragmas (void);
+ void aarch64_relayout_simd_types (void);
+ void aarch64_reset_previous_fndecl (void);
+ void aarch64_save_restore_target_globals (tree);
+-void aarch64_emit_approx_rsqrt (rtx, rtx);
+ 
+ /* Initialize builtins for SIMD intrinsics.  */
+ void init_aarch64_simd_builtins (void);
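
The AARCH64_APPROX_MODE macro added to aarch64-protos.h above packs one
enable bit per scalar floating-point mode into the low bits, followed by
one bit per vector floating-point mode, so each cpu_approx_modes entry
can enable approximations mode by mode (AARCH64_APPROX_ALL = -1 sets
every bit, AARCH64_APPROX_NONE = 0 clears them all).  A standalone
sketch of the same bit layout, with illustrative stand-ins for GCC's
machine_mode enumeration:

    /* Illustrative model of AARCH64_APPROX_MODE's bit packing; the
       enumerators stand in for GCC's real machine_mode values.  */
    enum mode { SF, DF,                 /* scalar float modes  */
                V2SF, V4SF, V2DF,       /* vector float modes  */
                MIN_FLT = SF,  MAX_FLT = DF,
                MIN_VEC = V2SF, MAX_VEC = V2DF };

    static unsigned
    approx_mode (enum mode m)
    {
      if (MIN_FLT <= m && m <= MAX_FLT)   /* bit 0 = SF, bit 1 = DF  */
        return 1u << (m - MIN_FLT);
      if (MIN_VEC <= m && m <= MAX_VEC)   /* bits 2..4 = V2SF..V2DF  */
        return 1u << (m - MIN_VEC + MAX_FLT - MIN_FLT + 1);
      return 0;                           /* not a float mode  */
    }
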
+--- a/src/gcc/config/aarch64/aarch64-simd-builtins.def
++++ b/src/gcc/config/aarch64/aarch64-simd-builtins.def
+@@ -449,3 +449,21 @@
+   /* Implemented by aarch64_sqrdml<SQRDMLH_AS:rdma_as>h_laneq<mode>.  */
+   BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlah_laneq, 0)
+   BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_laneq, 0)
++
++  /* Implemented by <FCVT_F2FIXED/FIXED2F:fcvt_fixed_insn><*><*>3.  */
++  BUILTIN_VSDQ_SDI (SHIFTIMM, scvtf, 3)
++  BUILTIN_VSDQ_SDI (FCVTIMM_SUS, ucvtf, 3)
++  BUILTIN_VALLF (SHIFTIMM, fcvtzs, 3)
++  BUILTIN_VALLF (SHIFTIMM_USS, fcvtzu, 3)
++
++  /* Implemented by aarch64_rsqrte<mode>.  */
++  BUILTIN_VALLF (UNOP, rsqrte, 0)
++
++  /* Implemented by aarch64_rsqrts<mode>.  */
++  BUILTIN_VALLF (BINOP, rsqrts, 0)
++
++  /* Implemented by fabd<mode>3.  */
++  BUILTIN_VALLF (BINOP, fabd, 3)
++
++  /* Implemented by aarch64_faddp<mode>.  */
++  BUILTIN_VDQF (BINOP, faddp, 0)
 --- a/src/gcc/config/aarch64/aarch64-simd.md
 +++ b/src/gcc/config/aarch64/aarch64-simd.md
-@@ -371,15 +371,15 @@
+@@ -371,18 +371,18 @@
    [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")]
  )
  
@@ -190,8 +398,82 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
 +  [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")]
  )
  
- (define_insn "aarch64_rsqrte_<mode>2"
-@@ -1579,16 +1579,16 @@
+-(define_insn "aarch64_rsqrte_<mode>2"
++(define_insn "aarch64_rsqrte<mode>"
+   [(set (match_operand:VALLF 0 "register_operand" "=w")
+ 	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+ 		     UNSPEC_RSQRTE))]
+@@ -390,7 +390,7 @@
+   "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
+   [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
+ 
+-(define_insn "aarch64_rsqrts_<mode>3"
++(define_insn "aarch64_rsqrts<mode>"
+   [(set (match_operand:VALLF 0 "register_operand" "=w")
+ 	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
+ 	       (match_operand:VALLF 2 "register_operand" "w")]
+@@ -405,7 +405,7 @@
+ 		     UNSPEC_RSQRT))]
+   "TARGET_SIMD"
+ {
+-  aarch64_emit_approx_rsqrt (operands[0], operands[1]);
++  aarch64_emit_approx_sqrt (operands[0], operands[1], true);
+   DONE;
+ })
+ 
+@@ -474,23 +474,14 @@
+   [(set_attr "type" "neon_arith_acc<q>")]
+ )
+ 
+-(define_insn "fabd<mode>_3"
+-  [(set (match_operand:VDQF 0 "register_operand" "=w")
+-	(abs:VDQF (minus:VDQF
+-		   (match_operand:VDQF 1 "register_operand" "w")
+-		   (match_operand:VDQF 2 "register_operand" "w"))))]
+-  "TARGET_SIMD"
+-  "fabd\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+-  [(set_attr "type" "neon_fp_abd_<Vetype><q>")]
+-)
+-
+-(define_insn "*fabd_scalar<mode>3"
+-  [(set (match_operand:GPF 0 "register_operand" "=w")
+-        (abs:GPF (minus:GPF
+-                 (match_operand:GPF 1 "register_operand" "w")
+-                 (match_operand:GPF 2 "register_operand" "w"))))]
++(define_insn "fabd<mode>3"
++  [(set (match_operand:VALLF 0 "register_operand" "=w")
++	(abs:VALLF
++	  (minus:VALLF
++	    (match_operand:VALLF 1 "register_operand" "w")
++	    (match_operand:VALLF 2 "register_operand" "w"))))]
+   "TARGET_SIMD"
+-  "fabd\t%<s>0, %<s>1, %<s>2"
++  "fabd\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
+   [(set_attr "type" "neon_fp_abd_<Vetype><q>")]
+ )
+ 
+@@ -1509,7 +1500,19 @@
+   [(set_attr "type" "neon_fp_mul_<Vetype><q>")]
+ )
+ 
+-(define_insn "div<mode>3"
++(define_expand "div<mode>3"
++ [(set (match_operand:VDQF 0 "register_operand")
++       (div:VDQF (match_operand:VDQF 1 "general_operand")
++		 (match_operand:VDQF 2 "register_operand")))]
++ "TARGET_SIMD"
++{
++  if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
++    DONE;
++
++  operands[1] = force_reg (<MODE>mode, operands[1]);
++})
++
++(define_insn "*div<mode>3"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+        (div:VDQF (match_operand:VDQF 1 "register_operand" "w")
+ 		 (match_operand:VDQF 2 "register_operand" "w")))]
+@@ -1579,16 +1582,16 @@
    [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
  )
  
@@ -217,7 +499,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  )
  
  (define_insn "*aarch64_fma4_elt_to_64v2df"
-@@ -1656,17 +1656,17 @@
+@@ -1656,17 +1659,17 @@
    [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
  )
  
@@ -246,7 +528,36 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  )
  
  (define_insn "*aarch64_fnma4_elt_to_64v2df"
-@@ -1979,19 +1979,6 @@
+@@ -1778,6 +1781,28 @@
+   [(set_attr "type" "neon_fp_cvt_widen_s")]
+ )
+ 
++;; Convert between fixed-point and floating-point (vector modes)
++
++(define_insn "<FCVT_F2FIXED:fcvt_fixed_insn><VDQF:mode>3"
++  [(set (match_operand:<VDQF:FCVT_TARGET> 0 "register_operand" "=w")
++	(unspec:<VDQF:FCVT_TARGET> [(match_operand:VDQF 1 "register_operand" "w")
++				    (match_operand:SI 2 "immediate_operand" "i")]
++	 FCVT_F2FIXED))]
++  "TARGET_SIMD"
++  "<FCVT_F2FIXED:fcvt_fixed_insn>\t%<v>0<Vmtype>, %<v>1<Vmtype>, #%2"
++  [(set_attr "type" "neon_fp_to_int_<VDQF:Vetype><q>")]
++)
++
++(define_insn "<FCVT_FIXED2F:fcvt_fixed_insn><VDQ_SDI:mode>3"
++  [(set (match_operand:<VDQ_SDI:FCVT_TARGET> 0 "register_operand" "=w")
++	(unspec:<VDQ_SDI:FCVT_TARGET> [(match_operand:VDQ_SDI 1 "register_operand" "w")
++				       (match_operand:SI 2 "immediate_operand" "i")]
++	 FCVT_FIXED2F))]
++  "TARGET_SIMD"
++  "<FCVT_FIXED2F:fcvt_fixed_insn>\t%<v>0<Vmtype>, %<v>1<Vmtype>, #%2"
++  [(set_attr "type" "neon_int_to_fp_<VDQ_SDI:Vetype><q>")]
++)
++
+ ;; ??? Note that the vectorizer usage of the vec_unpacks_[lo/hi] patterns
+ ;; is inconsistent with vector ordering elsewhere in the compiler, in that
+ ;; the meaning of HI and LO changes depending on the target endianness.
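
The new <fcvt_fixed_insn> patterns above give the fixed-point/floating-
point conversions with an immediate scale (scvtf/ucvtf, fcvtzs/fcvtzu)
vector expanders; together with the FCVTIMM_SUS qualifiers added to
aarch64-builtins.c they back the ACLE vcvt*_n_* intrinsics.  A
hypothetical use, assuming arm_neon.h on an AArch64 target routes the
intrinsic to these builtins:

    /* Hypothetical example: treat a vector of u32 as UQ16.16 fixed
       point and widen it to float, i.e. a single "ucvtf ..., #16".  */
    #include <arm_neon.h>

    float32x4_t
    fixed_to_float (uint32x4_t uq16_16)
    {
      return vcvtq_n_f32_u32 (uq16_16, 16);
    }
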
+@@ -1979,17 +2004,14 @@
    }
  )
  
@@ -261,12 +572,18 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
 -    emit_insn (gen_aarch64_get_lane<mode> (operands[0], scratch, elt));
 -    DONE;
 -  }
--)
--
++(define_insn "aarch64_faddp<mode>"
++ [(set (match_operand:VDQF 0 "register_operand" "=w")
++       (unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w")
++		     (match_operand:VDQF 2 "register_operand" "w")]
++		     UNSPEC_FADDV))]
++ "TARGET_SIMD"
++ "faddp\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
++  [(set_attr "type" "neon_fp_reduc_add_<Vetype><q>")]
+ )
+ 
  (define_insn "aarch64_reduc_plus_internal<mode>"
-  [(set (match_operand:VDQV 0 "register_operand" "=w")
-        (unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")]
-@@ -2010,9 +1997,9 @@
+@@ -2010,24 +2032,15 @@
    [(set_attr "type" "neon_reduc_add")]
  )
  
@@ -279,7 +596,33 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  		   UNSPEC_FADDV))]
   "TARGET_SIMD"
   "faddp\\t%<Vetype>0, %1.<Vtype>"
-@@ -2635,7 +2622,7 @@
+   [(set_attr "type" "neon_fp_reduc_add_<Vetype><q>")]
+ )
+ 
+-(define_insn "aarch64_addpv4sf"
+- [(set (match_operand:V4SF 0 "register_operand" "=w")
+-       (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "w")]
+-		    UNSPEC_FADDV))]
+- "TARGET_SIMD"
+- "faddp\\t%0.4s, %1.4s, %1.4s"
+-  [(set_attr "type" "neon_fp_reduc_add_s_q")]
+-)
+-
+ (define_expand "reduc_plus_scal_v4sf"
+  [(set (match_operand:SF 0 "register_operand")
+        (unspec:V4SF [(match_operand:V4SF 1 "register_operand")]
+@@ -2036,8 +2049,8 @@
+ {
+   rtx elt = GEN_INT (ENDIAN_LANE_N (V4SFmode, 0));
+   rtx scratch = gen_reg_rtx (V4SFmode);
+-  emit_insn (gen_aarch64_addpv4sf (scratch, operands[1]));
+-  emit_insn (gen_aarch64_addpv4sf (scratch, scratch));
++  emit_insn (gen_aarch64_faddpv4sf (scratch, operands[1], operands[1]));
++  emit_insn (gen_aarch64_faddpv4sf (scratch, scratch, scratch));
+   emit_insn (gen_aarch64_get_lanev4sf (operands[0], scratch, elt));
+   DONE;
+ })
+@@ -2635,7 +2648,7 @@
  (define_insn "*aarch64_combinez<mode>"
    [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
          (vec_concat:<VDBL>
@@ -288,7 +631,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  	   (match_operand:VD_BHSI 2 "aarch64_simd_imm_zero" "Dz,Dz,Dz")))]
    "TARGET_SIMD && !BYTES_BIG_ENDIAN"
    "@
-@@ -2651,7 +2638,7 @@
+@@ -2651,7 +2664,7 @@
    [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
          (vec_concat:<VDBL>
  	   (match_operand:VD_BHSI 2 "aarch64_simd_imm_zero" "Dz,Dz,Dz")
@@ -297,7 +640,25 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
    "TARGET_SIMD && BYTES_BIG_ENDIAN"
    "@
     mov\\t%0.8b, %1.8b
-@@ -4652,7 +4639,7 @@
+@@ -4297,7 +4310,16 @@
+ 
+ ;; sqrt
+ 
+-(define_insn "sqrt<mode>2"
++(define_expand "sqrt<mode>2"
++  [(set (match_operand:VDQF 0 "register_operand")
++	(sqrt:VDQF (match_operand:VDQF 1 "register_operand")))]
++  "TARGET_SIMD"
++{
++  if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
++    DONE;
++})
++
++(define_insn "*sqrt<mode>2"
+   [(set (match_operand:VDQF 0 "register_operand" "=w")
+         (sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
+   "TARGET_SIMD"
+@@ -4652,7 +4674,7 @@
     ld1\\t{%S0.16b - %<Vendreg>0.16b}, %1"
    [(set_attr "type" "multiple,neon_store<nregs>_<nregs>reg_q,\
  		     neon_load<nregs>_<nregs>reg_q")
@@ -306,7 +667,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  )
  
  (define_insn "aarch64_be_ld1<mode>"
-@@ -4685,7 +4672,7 @@
+@@ -4685,7 +4707,7 @@
     stp\\t%q1, %R1, %0
     ldp\\t%q0, %R0, %1"
    [(set_attr "type" "multiple,neon_stp_q,neon_ldp_q")
@@ -315,7 +676,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  )
  
  (define_insn "*aarch64_be_movci"
-@@ -4696,7 +4683,7 @@
+@@ -4696,7 +4718,7 @@
         || register_operand (operands[1], CImode))"
    "#"
    [(set_attr "type" "multiple")
@@ -324,7 +685,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  )
  
  (define_insn "*aarch64_be_movxi"
-@@ -4707,7 +4694,7 @@
+@@ -4707,7 +4729,7 @@
         || register_operand (operands[1], XImode))"
    "#"
    [(set_attr "type" "multiple")
@@ -333,7 +694,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  )
  
  (define_split
-@@ -5414,13 +5401,25 @@
+@@ -5414,13 +5436,25 @@
    [(set_attr "type" "crypto_aese")]
  )
  
@@ -362,9 +723,235 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  )
  
  ;; sha1
+--- a/src/gcc/config/aarch64/aarch64-tune.md
++++ b/src/gcc/config/aarch64/aarch64-tune.md
+@@ -1,5 +1,5 @@
+ ;; -*- buffer-read-only: t -*-
+ ;; Generated automatically by gentune.sh from aarch64-cores.def
+ (define_attr "tune"
+-	"cortexa35,cortexa53,cortexa57,cortexa72,exynosm1,qdf24xx,thunderx,xgene1,cortexa57cortexa53,cortexa72cortexa53"
++	"cortexa35,cortexa53,cortexa57,cortexa72,exynosm1,qdf24xx,thunderx,xgene1,vulcan,cortexa57cortexa53,cortexa72cortexa53"
+ 	(const (symbol_ref "((enum attr_tune) aarch64_tune)")))
+--- a/src/gcc/config/aarch64/aarch64-tuning-flags.def
++++ b/src/gcc/config/aarch64/aarch64-tuning-flags.def
+@@ -29,5 +29,3 @@
+      AARCH64_TUNE_ to give an enum name. */
+ 
+ AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
+-AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT)
+-
 --- a/src/gcc/config/aarch64/aarch64.c
 +++ b/src/gcc/config/aarch64/aarch64.c
-@@ -3582,7 +3582,12 @@ aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
+@@ -250,6 +250,22 @@ static const struct cpu_addrcost_table xgene1_addrcost_table =
+   0, /* imm_offset  */
+ };
+ 
++static const struct cpu_addrcost_table vulcan_addrcost_table =
++{
++    {
++      0, /* hi  */
++      0, /* si  */
++      0, /* di  */
++      2, /* ti  */
++    },
++  0, /* pre_modify  */
++  0, /* post_modify  */
++  2, /* register_offset  */
++  3, /* register_sextend  */
++  3, /* register_zextend  */
++  0, /* imm_offset  */
++};
++
+ static const struct cpu_regmove_cost generic_regmove_cost =
+ {
+   1, /* GP2GP  */
+@@ -308,6 +324,15 @@ static const struct cpu_regmove_cost xgene1_regmove_cost =
+   2 /* FP2FP  */
+ };
+ 
++static const struct cpu_regmove_cost vulcan_regmove_cost =
++{
++  1, /* GP2GP  */
++  /* Avoid the use of int<->fp moves for spilling.  */
++  8, /* GP2FP  */
++  8, /* FP2GP  */
++  4  /* FP2FP  */
++};
++
+ /* Generic costs for vector insn classes.  */
+ static const struct cpu_vector_cost generic_vector_cost =
+ {
+@@ -379,6 +404,24 @@ static const struct cpu_vector_cost xgene1_vector_cost =
+   1 /* cond_not_taken_branch_cost  */
+ };
+ 
++/* Costs for vector insn classes for Vulcan.  */
++static const struct cpu_vector_cost vulcan_vector_cost =
++{
++  6, /* scalar_stmt_cost  */
++  4, /* scalar_load_cost  */
++  1, /* scalar_store_cost  */
++  6, /* vec_stmt_cost  */
++  3, /* vec_permute_cost  */
++  6, /* vec_to_scalar_cost  */
++  5, /* scalar_to_vec_cost  */
++  8, /* vec_align_load_cost  */
++  8, /* vec_unalign_load_cost  */
++  4, /* vec_unalign_store_cost  */
++  4, /* vec_store_cost  */
++  2, /* cond_taken_branch_cost  */
++  1  /* cond_not_taken_branch_cost  */
++};
++
+ /* Generic costs for branch instructions.  */
+ static const struct cpu_branch_cost generic_branch_cost =
+ {
+@@ -393,6 +436,37 @@ static const struct cpu_branch_cost cortexa57_branch_cost =
+   3   /* Unpredictable.  */
+ };
+ 
++/* Branch costs for Vulcan.  */
++static const struct cpu_branch_cost vulcan_branch_cost =
++{
++  1,  /* Predictable.  */
++  3   /* Unpredictable.  */
++};
++
++/* Generic approximation modes.  */
++static const cpu_approx_modes generic_approx_modes =
++{
++  AARCH64_APPROX_NONE,	/* division  */
++  AARCH64_APPROX_NONE,	/* sqrt  */
++  AARCH64_APPROX_NONE	/* recip_sqrt  */
++};
++
++/* Approximation modes for Exynos M1.  */
++static const cpu_approx_modes exynosm1_approx_modes =
++{
++  AARCH64_APPROX_NONE,	/* division  */
++  AARCH64_APPROX_ALL,	/* sqrt  */
++  AARCH64_APPROX_ALL	/* recip_sqrt  */
++};
++
++/* Approximation modes for X-Gene 1.  */
++static const cpu_approx_modes xgene1_approx_modes =
++{
++  AARCH64_APPROX_NONE,	/* division  */
++  AARCH64_APPROX_NONE,	/* sqrt  */
++  AARCH64_APPROX_ALL	/* recip_sqrt  */
++};
++
+ static const struct tune_params generic_tunings =
+ {
+   &cortexa57_extra_costs,
+@@ -400,6 +474,7 @@ static const struct tune_params generic_tunings =
+   &generic_regmove_cost,
+   &generic_vector_cost,
+   &generic_branch_cost,
++  &generic_approx_modes,
+   4, /* memmov_cost  */
+   2, /* issue_rate  */
+   AARCH64_FUSE_NOTHING, /* fusible_ops  */
+@@ -424,6 +499,7 @@ static const struct tune_params cortexa35_tunings =
+   &cortexa53_regmove_cost,
+   &generic_vector_cost,
+   &generic_branch_cost,
++  &generic_approx_modes,
+   4, /* memmov_cost  */
+   1, /* issue_rate  */
+   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+@@ -449,6 +525,7 @@ static const struct tune_params cortexa53_tunings =
+   &cortexa53_regmove_cost,
+   &generic_vector_cost,
+   &generic_branch_cost,
++  &generic_approx_modes,
+   4, /* memmov_cost  */
+   2, /* issue_rate  */
+   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+@@ -474,6 +551,7 @@ static const struct tune_params cortexa57_tunings =
+   &cortexa57_regmove_cost,
+   &cortexa57_vector_cost,
+   &cortexa57_branch_cost,
++  &generic_approx_modes,
+   4, /* memmov_cost  */
+   3, /* issue_rate  */
+   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+@@ -499,6 +577,7 @@ static const struct tune_params cortexa72_tunings =
+   &cortexa57_regmove_cost,
+   &cortexa57_vector_cost,
+   &generic_branch_cost,
++  &generic_approx_modes,
+   4, /* memmov_cost  */
+   3, /* issue_rate  */
+   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+@@ -524,6 +603,7 @@ static const struct tune_params exynosm1_tunings =
+   &exynosm1_regmove_cost,
+   &exynosm1_vector_cost,
+   &generic_branch_cost,
++  &exynosm1_approx_modes,
+   4,	/* memmov_cost  */
+   3,	/* issue_rate  */
+   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
+@@ -538,7 +618,7 @@ static const struct tune_params exynosm1_tunings =
+   48,	/* max_case_values.  */
+   64,	/* cache_line_size.  */
+   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
+-  (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags.  */
++  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
+ };
+ 
+ static const struct tune_params thunderx_tunings =
+@@ -548,6 +628,7 @@ static const struct tune_params thunderx_tunings =
+   &thunderx_regmove_cost,
+   &generic_vector_cost,
+   &generic_branch_cost,
++  &generic_approx_modes,
+   6, /* memmov_cost  */
+   2, /* issue_rate  */
+   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
+@@ -572,6 +653,7 @@ static const struct tune_params xgene1_tunings =
+   &xgene1_regmove_cost,
+   &xgene1_vector_cost,
+   &generic_branch_cost,
++  &xgene1_approx_modes,
+   6, /* memmov_cost  */
+   4, /* issue_rate  */
+   AARCH64_FUSE_NOTHING, /* fusible_ops  */
+@@ -586,7 +668,32 @@ static const struct tune_params xgene1_tunings =
+   0,	/* max_case_values.  */
+   0,	/* cache_line_size.  */
+   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
+-  (AARCH64_EXTRA_TUNE_APPROX_RSQRT)	/* tune_flags.  */
++  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
++};
++
++static const struct tune_params vulcan_tunings =
++{
++  &vulcan_extra_costs,
++  &vulcan_addrcost_table,
++  &vulcan_regmove_cost,
++  &vulcan_vector_cost,
++  &vulcan_branch_cost,
++  &generic_approx_modes,
++  4, /* memmov_cost.  */
++  4, /* issue_rate.  */
++  AARCH64_FUSE_NOTHING, /* fuseable_ops.  */
++  16,	/* function_align.  */
++  8,	/* jump_align.  */
++  16,	/* loop_align.  */
++  3,	/* int_reassoc_width.  */
++  2,	/* fp_reassoc_width.  */
++  2,	/* vec_reassoc_width.  */
++  2,	/* min_div_recip_mul_sf.  */
++  2,	/* min_div_recip_mul_df.  */
++  0,	/* max_case_values.  */
++  64,	/* cache_line_size.  */
++  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
++  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
+ };
+ 
+ /* Support for fine-grained override of the tuning structures.  */
+@@ -3582,7 +3689,12 @@ aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
    return aarch64_tls_referenced_p (x);
  }
  
@@ -378,7 +965,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  
  static unsigned int
  aarch64_case_values_threshold (void)
-@@ -3593,7 +3598,7 @@ aarch64_case_values_threshold (void)
+@@ -3593,7 +3705,7 @@ aarch64_case_values_threshold (void)
        && selected_cpu->tune->max_case_values != 0)
      return selected_cpu->tune->max_case_values;
    else
@@ -387,7 +974,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  }
  
  /* Return true if register REGNO is a valid index register.
-@@ -4232,14 +4237,6 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
+@@ -4232,14 +4344,6 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
        && GET_CODE (x) == NEG)
      return CC_Zmode;
  
@@ -402,7 +989,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
    /* A test for unsigned overflow.  */
    if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
        && code == NE
-@@ -4308,8 +4305,6 @@ aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
+@@ -4308,8 +4412,6 @@ aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
        break;
  
      case CC_SWPmode:
@@ -411,7 +998,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
        switch (comp_code)
  	{
  	case NE: return AARCH64_NE;
-@@ -5022,120 +5017,6 @@ aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
+@@ -5022,120 +5124,6 @@ aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
    return x;
  }
  
@@ -532,7 +1119,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  /* Return the reload icode required for a constant pool in mode.  */
  static enum insn_code
  aarch64_constant_pool_reload_icode (machine_mode mode)
-@@ -6411,10 +6292,6 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
+@@ -6411,10 +6399,6 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
            /* TODO: A write to the CC flags possibly costs extra, this
  	     needs encoding in the cost tables.  */
  
@@ -543,7 +1130,390 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  	  mode = GET_MODE (op0);
            /* ANDS.  */
            if (GET_CODE (op0) == AND)
-@@ -10851,33 +10728,6 @@ aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
+@@ -7452,12 +7436,12 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
+    to optimize 1.0/sqrt.  */
+ 
+ static bool
+-use_rsqrt_p (void)
++use_rsqrt_p (machine_mode mode)
+ {
+   return (!flag_trapping_math
+ 	  && flag_unsafe_math_optimizations
+-	  && ((aarch64_tune_params.extra_tuning_flags
+-	       & AARCH64_EXTRA_TUNE_APPROX_RSQRT)
++	  && ((aarch64_tune_params.approx_modes->recip_sqrt
++	       & AARCH64_APPROX_MODE (mode))
+ 	      || flag_mrecip_low_precision_sqrt));
+ }
+ 
+@@ -7467,89 +7451,217 @@ use_rsqrt_p (void)
+ static tree
+ aarch64_builtin_reciprocal (tree fndecl)
+ {
+-  if (!use_rsqrt_p ())
++  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
++
++  if (!use_rsqrt_p (mode))
+     return NULL_TREE;
+   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
+ }
+ 
+ typedef rtx (*rsqrte_type) (rtx, rtx);
+ 
+-/* Select reciprocal square root initial estimate
+-   insn depending on machine mode.  */
++/* Select reciprocal square root initial estimate insn depending on machine
++   mode.  */
+ 
+-rsqrte_type
++static rsqrte_type
+ get_rsqrte_type (machine_mode mode)
+ {
+   switch (mode)
+   {
+-    case DFmode:   return gen_aarch64_rsqrte_df2;
+-    case SFmode:   return gen_aarch64_rsqrte_sf2;
+-    case V2DFmode: return gen_aarch64_rsqrte_v2df2;
+-    case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
+-    case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
++    case DFmode:   return gen_aarch64_rsqrtedf;
++    case SFmode:   return gen_aarch64_rsqrtesf;
++    case V2DFmode: return gen_aarch64_rsqrtev2df;
++    case V2SFmode: return gen_aarch64_rsqrtev2sf;
++    case V4SFmode: return gen_aarch64_rsqrtev4sf;
+     default: gcc_unreachable ();
+   }
+ }
+ 
+ typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
+ 
+-/* Select reciprocal square root Newton-Raphson step
+-   insn depending on machine mode.  */
++/* Select reciprocal square root series step insn depending on machine mode.  */
+ 
+-rsqrts_type
++static rsqrts_type
+ get_rsqrts_type (machine_mode mode)
+ {
+   switch (mode)
+   {
+-    case DFmode:   return gen_aarch64_rsqrts_df3;
+-    case SFmode:   return gen_aarch64_rsqrts_sf3;
+-    case V2DFmode: return gen_aarch64_rsqrts_v2df3;
+-    case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
+-    case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
++    case DFmode:   return gen_aarch64_rsqrtsdf;
++    case SFmode:   return gen_aarch64_rsqrtssf;
++    case V2DFmode: return gen_aarch64_rsqrtsv2df;
++    case V2SFmode: return gen_aarch64_rsqrtsv2sf;
++    case V4SFmode: return gen_aarch64_rsqrtsv4sf;
+     default: gcc_unreachable ();
+   }
+ }
+ 
+-/* Emit instruction sequence to compute the reciprocal square root using the
+-   Newton-Raphson series.  Iterate over the series twice for SF
+-   and thrice for DF.  */
++/* Emit instruction sequence to compute either the approximate square root
++   or its approximate reciprocal, depending on the flag RECP, and return
++   whether the sequence was emitted or not.  */
+ 
+-void
+-aarch64_emit_approx_rsqrt (rtx dst, rtx src)
++bool
++aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
+ {
+-  machine_mode mode = GET_MODE (src);
+-  gcc_assert (
+-    mode == SFmode || mode == V2SFmode || mode == V4SFmode
+-	|| mode == DFmode || mode == V2DFmode);
+-
+-  rtx xsrc = gen_reg_rtx (mode);
+-  emit_move_insn (xsrc, src);
+-  rtx x0 = gen_reg_rtx (mode);
++  machine_mode mode = GET_MODE (dst);
++  machine_mode mmsk = mode_for_vector
++		        (int_mode_for_mode (GET_MODE_INNER (mode)),
++			 GET_MODE_NUNITS (mode));
++  bool use_approx_sqrt_p = (!recp
++			    && (flag_mlow_precision_sqrt
++			        || (aarch64_tune_params.approx_modes->sqrt
++				    & AARCH64_APPROX_MODE (mode))));
++  bool use_approx_rsqrt_p = (recp
++			     && (flag_mrecip_low_precision_sqrt
++				 || (aarch64_tune_params.approx_modes->recip_sqrt
++				     & AARCH64_APPROX_MODE (mode))));
++
++  if (!flag_finite_math_only
++      || flag_trapping_math
++      || !flag_unsafe_math_optimizations
++      || !(use_approx_sqrt_p || use_approx_rsqrt_p)
++      || optimize_function_for_size_p (cfun))
++    return false;
+ 
+-  emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
++  rtx xmsk = gen_reg_rtx (mmsk);
++  if (!recp)
++    /* When calculating the approximate square root, compare the argument with
++       0.0 and create a mask.  */
++    emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
++							  CONST0_RTX (mode)))));
+ 
+-  bool double_mode = (mode == DFmode || mode == V2DFmode);
++  /* Estimate the approximate reciprocal square root.  */
++  rtx xdst = gen_reg_rtx (mode);
++  emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
+ 
+-  int iterations = double_mode ? 3 : 2;
++  /* Iterate over the series twice for SF and thrice for DF.  */
++  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
+ 
+-  /* Optionally iterate over the series one less time than otherwise.  */
+-  if (flag_mrecip_low_precision_sqrt)
++  /* Optionally iterate over the series once less for faster performance
++     while sacrificing the accuracy.  */
++  if ((recp && flag_mrecip_low_precision_sqrt)
++      || (!recp && flag_mlow_precision_sqrt))
+     iterations--;
+ 
+-  for (int i = 0; i < iterations; ++i)
++  /* Iterate over the series to calculate the approximate reciprocal square
++     root.  */
++  rtx x1 = gen_reg_rtx (mode);
++  while (iterations--)
+     {
+-      rtx x1 = gen_reg_rtx (mode);
+       rtx x2 = gen_reg_rtx (mode);
+-      rtx x3 = gen_reg_rtx (mode);
+-      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
++      emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
++
++      emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
+ 
+-      emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
++      if (iterations > 0)
++	emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
++    }
++
++  if (!recp)
++    {
++      /* Qualify the approximate reciprocal square root when the argument is
++	 0.0 by squashing the intermediary result to 0.0.  */
++      rtx xtmp = gen_reg_rtx (mmsk);
++      emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
++					      gen_rtx_SUBREG (mmsk, xdst, 0)));
++      emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
+ 
+-      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
+-      x0 = x1;
++      /* Calculate the approximate square root.  */
++      emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
+     }
+ 
+-  emit_move_insn (dst, x0);
++  /* Finalize the approximation.  */
++  emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
++
++  return true;
++}
++
++typedef rtx (*recpe_type) (rtx, rtx);
++
++/* Select reciprocal initial estimate insn depending on machine mode.  */
++
++static recpe_type
++get_recpe_type (machine_mode mode)
++{
++  switch (mode)
++  {
++    case SFmode:   return (gen_aarch64_frecpesf);
++    case V2SFmode: return (gen_aarch64_frecpev2sf);
++    case V4SFmode: return (gen_aarch64_frecpev4sf);
++    case DFmode:   return (gen_aarch64_frecpedf);
++    case V2DFmode: return (gen_aarch64_frecpev2df);
++    default:       gcc_unreachable ();
++  }
++}
++
++typedef rtx (*recps_type) (rtx, rtx, rtx);
++
++/* Select reciprocal series step insn depending on machine mode.  */
++
++static recps_type
++get_recps_type (machine_mode mode)
++{
++  switch (mode)
++  {
++    case SFmode:   return (gen_aarch64_frecpssf);
++    case V2SFmode: return (gen_aarch64_frecpsv2sf);
++    case V4SFmode: return (gen_aarch64_frecpsv4sf);
++    case DFmode:   return (gen_aarch64_frecpsdf);
++    case V2DFmode: return (gen_aarch64_frecpsv2df);
++    default:       gcc_unreachable ();
++  }
++}
++
++/* Emit the instruction sequence to compute the approximation for the division
++   of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
++
++bool
++aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
++{
++  machine_mode mode = GET_MODE (quo);
++  bool use_approx_division_p = (flag_mlow_precision_div
++			        || (aarch64_tune_params.approx_modes->division
++				    & AARCH64_APPROX_MODE (mode)));
++
++  if (!flag_finite_math_only
++      || flag_trapping_math
++      || !flag_unsafe_math_optimizations
++      || optimize_function_for_size_p (cfun)
++      || !use_approx_division_p)
++    return false;
++
++  /* Estimate the approximate reciprocal.  */
++  rtx xrcp = gen_reg_rtx (mode);
++  emit_insn ((*get_recpe_type (mode)) (xrcp, den));
++
++  /* Iterate over the series twice for SF and thrice for DF.  */
++  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
++
++  /* Optionally iterate over the series once less for faster performance,
++     while sacrificing the accuracy.  */
++  if (flag_mlow_precision_div)
++    iterations--;
++
++  /* Iterate over the series to calculate the approximate reciprocal.  */
++  rtx xtmp = gen_reg_rtx (mode);
++  while (iterations--)
++    {
++      emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
++
++      if (iterations > 0)
++	emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
++    }
++
++  if (num != CONST1_RTX (mode))
++    {
++      /* As the approximate reciprocal of DEN is already calculated, only
++	 calculate the approximate division when NUM is not 1.0.  */
++      rtx xnum = force_reg (mode, num);
++      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
++    }
++
++  /* Finalize the approximation.  */
++  emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
++  return true;
+ }
+ 
+ /* Return the number of instructions that can be issued per cycle.  */
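
In scalar terms, aarch64_emit_approx_sqrt and aarch64_emit_approx_div
above emit the usual Newton series built from the AArch64 estimate-and-
step instructions: frsqrts computes (3 - a*b)/2 and frecps computes
2 - a*b.  A rough C model of the reciprocal square root refinement,
where e0 stands in for the frsqrte table estimate (the real code keeps
everything in RTL registers and additionally masks the x == 0.0 case
for the plain square root):

    /* Rough scalar model of the emitted rsqrt series, not GCC code.
       e0 models the frsqrte initial estimate.  */
    static double
    rsqrt_series (double x, double e0)
    {
      double e = e0;
      int iterations = 3;        /* three steps for DF, two for SF  */
      while (iterations--)
        {
          double step = (3.0 - x * e * e) / 2.0;  /* frsqrts (x, e*e) */
          e = e * step;                           /* refine estimate  */
        }
      return e;                  /* ~ 1/sqrt (x)  */
    }
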
+@@ -8079,6 +8191,12 @@ aarch64_override_options_after_change_1 (struct gcc_options *opts)
+       && (aarch64_cmodel == AARCH64_CMODEL_TINY
+ 	  || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC))
+     aarch64_nopcrelative_literal_loads = false;
++
++  /* When enabling the lower precision Newton series for the square root, also
++     enable it for the reciprocal square root, since the latter is an
++     intermediary step for the former.  */
++  if (flag_mlow_precision_sqrt)
++    flag_mrecip_low_precision_sqrt = true;
+ }
+ 
+ /* 'Unpack' up the internal tuning structs and update the options
+@@ -9463,6 +9581,13 @@ aarch64_build_builtin_va_list (void)
+ 			FIELD_DECL, get_identifier ("__vr_offs"),
+ 			integer_type_node);
+ 
++  /* Tell tree-stdarg pass about our internal offset fields.
++     NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
++     purpose to identify whether the code is updating va_list internal
++     offset fields through irregular way.  */
++  va_list_gpr_counter_field = f_groff;
++  va_list_fpr_counter_field = f_vroff;
++
+   DECL_ARTIFICIAL (f_stack) = 1;
+   DECL_ARTIFICIAL (f_grtop) = 1;
+   DECL_ARTIFICIAL (f_vrtop) = 1;
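
The effect of publishing __gr_offs/__vr_offs to the tree-stdarg pass is
easiest to see from a caller like the hypothetical one below: va_arg
only ever fetches integers, so the pass can shrink va_list_fpr_size to
zero and, with the aarch64_expand_builtin_va_start and
aarch64_setup_incoming_varargs changes that follow, the prologue no
longer needs to spill all eight FP/SIMD argument registers q0-q7.

    /* Hypothetical example: only general registers are read back via
       va_arg, so the vector-register save area can be skipped.  */
    #include <stdarg.h>

    int
    sum_ints (int n, ...)
    {
      va_list ap;
      int total = 0;

      va_start (ap, n);
      while (n-- > 0)
        total += va_arg (ap, int);    /* GP registers only  */
      va_end (ap);
      return total;
    }
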
+@@ -9495,15 +9620,17 @@ aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
+   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
+   tree stack, grtop, vrtop, groff, vroff;
+   tree t;
+-  int gr_save_area_size;
+-  int vr_save_area_size;
++  int gr_save_area_size = cfun->va_list_gpr_size;
++  int vr_save_area_size = cfun->va_list_fpr_size;
+   int vr_offset;
+ 
+   cum = &crtl->args.info;
+-  gr_save_area_size
+-    = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
+-  vr_save_area_size
+-    = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
++  if (cfun->va_list_gpr_size)
++    gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
++			     cfun->va_list_gpr_size);
++  if (cfun->va_list_fpr_size)
++    vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
++			     * UNITS_PER_VREG, cfun->va_list_fpr_size);
+ 
+   if (!TARGET_FLOAT)
+     {
+@@ -9832,7 +9959,8 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
+ {
+   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
+   CUMULATIVE_ARGS local_cum;
+-  int gr_saved, vr_saved;
++  int gr_saved = cfun->va_list_gpr_size;
++  int vr_saved = cfun->va_list_fpr_size;
+ 
+   /* The caller has advanced CUM up to, but not beyond, the last named
+      argument.  Advance a local copy of CUM past the last "real" named
+@@ -9840,9 +9968,14 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
+   local_cum = *cum;
+   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
+ 
+-  /* Found out how many registers we need to save.  */
+-  gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
+-  vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
++  /* Found out how many registers we need to save.
++     Honor tree-stdvar analysis results.  */
++  if (cfun->va_list_gpr_size)
++    gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
++		    cfun->va_list_gpr_size / UNITS_PER_WORD);
++  if (cfun->va_list_fpr_size)
++    vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
++		    cfun->va_list_fpr_size / UNITS_PER_VREG);
+ 
+   if (!TARGET_FLOAT)
+     {
+@@ -9870,7 +10003,7 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
+ 	  /* We can't use move_block_from_reg, because it will use
+ 	     the wrong mode, storing D regs only.  */
+ 	  machine_mode mode = TImode;
+-	  int off, i;
++	  int off, i, vr_start;
+ 
+ 	  /* Set OFF to the offset from virtual_incoming_args_rtx of
+ 	     the first vector register.  The VR save area lies below
+@@ -9879,14 +10012,15 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
+ 			   STACK_BOUNDARY / BITS_PER_UNIT);
+ 	  off -= vr_saved * UNITS_PER_VREG;
+ 
+-	  for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
++	  vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
++	  for (i = 0; i < vr_saved; ++i)
+ 	    {
+ 	      rtx ptr, mem;
+ 
+ 	      ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
+ 	      mem = gen_frame_mem (mode, ptr);
+ 	      set_mem_alias_set (mem, get_varargs_alias_set ());
+-	      aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
++	      aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
+ 	      off += UNITS_PER_VREG;
+ 	    }
+ 	}
+@@ -10848,33 +10982,6 @@ aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
  		      gen_rtx_REG (mode, rsrc + count - i - 1));
  }
  
@@ -577,7 +1547,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
     one of VSTRUCT modes: OI, CI, or XI.  */
  int
-@@ -11959,12 +11809,11 @@ aarch64_output_simd_mov_immediate (rtx const_vector,
+@@ -11956,12 +12063,11 @@ aarch64_output_simd_mov_immediate (rtx const_vector,
          info.value = GEN_INT (0);
        else
  	{
@@ -591,7 +1561,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  
  	  if (lane_count == 1)
  	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
-@@ -13317,6 +13166,14 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
+@@ -13314,6 +13420,14 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
    return false;
  }
  
@@ -606,7 +1576,23 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  /* If MEM is in the form of [base+offset], extract the two parts
     of address and set to BASE and OFFSET, otherwise return false
     after clearing BASE and OFFSET.  */
-@@ -14232,6 +14089,9 @@ aarch64_optab_supported_p (int op, machine_mode, machine_mode,
+@@ -13886,13 +14000,13 @@ aarch64_promoted_type (const_tree t)
+ /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
+ 
+ static bool
+-aarch64_optab_supported_p (int op, machine_mode, machine_mode,
++aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
+ 			   optimization_type opt_type)
+ {
+   switch (op)
+     {
+     case rsqrt_optab:
+-      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
++      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
+ 
+     default:
+       return true;
+@@ -14229,6 +14343,9 @@ aarch64_optab_supported_p (int op, machine_mode, machine_mode,
  #undef TARGET_OPTAB_SUPPORTED_P
  #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
  
@@ -651,7 +1637,114 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
     required size of load/store.  */
 --- a/src/gcc/config/aarch64/aarch64.md
 +++ b/src/gcc/config/aarch64/aarch64.md
-@@ -1783,7 +1783,7 @@
+@@ -75,6 +75,8 @@
+     UNSPEC_CRC32H
+     UNSPEC_CRC32W
+     UNSPEC_CRC32X
++    UNSPEC_FCVTZS
++    UNSPEC_FCVTZU
+     UNSPEC_URECPE
+     UNSPEC_FRECPE
+     UNSPEC_FRECPS
+@@ -105,6 +107,7 @@
+     UNSPEC_NOP
+     UNSPEC_PRLG_STK
+     UNSPEC_RBIT
++    UNSPEC_SCVTF
+     UNSPEC_SISD_NEG
+     UNSPEC_SISD_SSHL
+     UNSPEC_SISD_USHL
+@@ -122,6 +125,7 @@
+     UNSPEC_TLSLE24
+     UNSPEC_TLSLE32
+     UNSPEC_TLSLE48
++    UNSPEC_UCVTF
+     UNSPEC_USHL_2S
+     UNSPEC_VSTRUCTDUMMY
+     UNSPEC_SP_SET
+@@ -1178,11 +1182,12 @@
+ )
+ 
+ (define_insn "*movhf_aarch64"
+-  [(set (match_operand:HF 0 "nonimmediate_operand" "=w, ?r,w,w,m,r,m ,r")
+-	(match_operand:HF 1 "general_operand"      "?rY, w,w,m,w,m,rY,r"))]
++  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  ,?r,w,w,m,r,m ,r")
++	(match_operand:HF 1 "general_operand"      "Y ,?rY, w,w,m,w,m,rY,r"))]
+   "TARGET_FLOAT && (register_operand (operands[0], HFmode)
+     || aarch64_reg_or_fp_zero (operands[1], HFmode))"
+   "@
++   movi\\t%0.4h, #0
+    mov\\t%0.h[0], %w1
+    umov\\t%w0, %1.h[0]
+    mov\\t%0.h[0], %1.h[0]
+@@ -1191,18 +1196,18 @@
+    ldrh\\t%w0, %1
+    strh\\t%w1, %0
+    mov\\t%w0, %w1"
+-  [(set_attr "type" "neon_from_gp,neon_to_gp,neon_move,\
++  [(set_attr "type" "neon_move,neon_from_gp,neon_to_gp,neon_move,\
+                      f_loads,f_stores,load1,store1,mov_reg")
+-   (set_attr "simd" "yes,yes,yes,*,*,*,*,*")
+-   (set_attr "fp"   "*,*,*,yes,yes,*,*,*")]
++   (set_attr "simd" "yes,yes,yes,yes,*,*,*,*,*")]
+ )
+ 
+ (define_insn "*movsf_aarch64"
+-  [(set (match_operand:SF 0 "nonimmediate_operand" "=w, ?r,w,w  ,w,m,r,m ,r")
+-	(match_operand:SF 1 "general_operand"      "?rY, w,w,Ufc,m,w,m,rY,r"))]
++  [(set (match_operand:SF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w,m,r,m ,r")
++	(match_operand:SF 1 "general_operand"      "Y ,?rY, w,w,Ufc,m,w,m,rY,r"))]
+   "TARGET_FLOAT && (register_operand (operands[0], SFmode)
+     || aarch64_reg_or_fp_zero (operands[1], SFmode))"
+   "@
++   movi\\t%0.2s, #0
+    fmov\\t%s0, %w1
+    fmov\\t%w0, %s1
+    fmov\\t%s0, %s1
+@@ -1212,16 +1217,18 @@
+    ldr\\t%w0, %1
+    str\\t%w1, %0
+    mov\\t%w0, %w1"
+-  [(set_attr "type" "f_mcr,f_mrc,fmov,fconsts,\
+-                     f_loads,f_stores,load1,store1,mov_reg")]
++  [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconsts,\
++                     f_loads,f_stores,load1,store1,mov_reg")
++   (set_attr "simd" "yes,*,*,*,*,*,*,*,*,*")]
+ )
+ 
+ (define_insn "*movdf_aarch64"
+-  [(set (match_operand:DF 0 "nonimmediate_operand" "=w, ?r,w,w  ,w,m,r,m ,r")
+-	(match_operand:DF 1 "general_operand"      "?rY, w,w,Ufc,m,w,m,rY,r"))]
++  [(set (match_operand:DF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w,m,r,m ,r")
++	(match_operand:DF 1 "general_operand"      "Y ,?rY, w,w,Ufc,m,w,m,rY,r"))]
+   "TARGET_FLOAT && (register_operand (operands[0], DFmode)
+     || aarch64_reg_or_fp_zero (operands[1], DFmode))"
+   "@
++   movi\\t%d0, #0
+    fmov\\t%d0, %x1
+    fmov\\t%x0, %d1
+    fmov\\t%d0, %d1
+@@ -1231,8 +1238,9 @@
+    ldr\\t%x0, %1
+    str\\t%x1, %0
+    mov\\t%x0, %x1"
+-  [(set_attr "type" "f_mcr,f_mrc,fmov,fconstd,\
+-                     f_loadd,f_stored,load1,store1,mov_reg")]
++  [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconstd,\
++                     f_loadd,f_stored,load1,store1,mov_reg")
++   (set_attr "simd" "yes,*,*,*,*,*,*,*,*,*")]
+ )
+ 
+ (define_insn "*movtf_aarch64"
+@@ -1257,7 +1265,6 @@
+   [(set_attr "type" "logic_reg,multiple,f_mcr,f_mrc,neon_move_q,f_mcr,\
+                      f_loadd,f_stored,load2,store2,store2")
+    (set_attr "length" "4,8,8,8,4,4,4,4,4,4,4")
+-   (set_attr "fp" "*,*,yes,yes,*,yes,yes,yes,*,*,*")
+    (set_attr "simd" "yes,*,*,*,yes,*,*,*,*,*,*")]
+ )
+ 
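
The scalar FP move patterns above gain a leading alternative that materialises +0.0 with a SIMD movi instead of transferring wzr/xzr into an FP register. A hedged sketch of code this can touch (whether movi is chosen depends on register allocation):

/* Illustrative: zeroing an FP result can now be emitted as
   movi v0.2s, #0 (SF) or movi d0, #0 (DF) when SIMD is available,
   avoiding a GP-to-FP register transfer.  */
double zero (void)
{
  return 0.0;
}
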
+@@ -1783,7 +1790,7 @@
    "aarch64_zero_extend_const_eq (<DWI>mode, operands[2],
  				 <MODE>mode, operands[1])"
    "@
@@ -660,7 +1753,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
    cmp\\t%<w>0, #%n1"
    [(set_attr "type" "alus_imm")]
  )
-@@ -1815,11 +1815,11 @@
+@@ -1815,11 +1822,11 @@
    "aarch64_zero_extend_const_eq (<DWI>mode, operands[3],
                                   <MODE>mode, operands[2])"
    "@
@@ -674,7 +1767,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  (define_insn "add<mode>3_compareC"
    [(set (reg:CC_C CC_REGNUM)
  	(ne:CC_C
-@@ -3422,7 +3422,9 @@
+@@ -3422,7 +3429,9 @@
           (LOGICAL:SI (match_operand:SI 1 "register_operand" "%r,r")
  		     (match_operand:SI 2 "aarch64_logical_operand" "r,K"))))]
    ""
@@ -685,7 +1778,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
    [(set_attr "type" "logic_reg,logic_imm")]
  )
  
-@@ -3435,7 +3437,9 @@
+@@ -3435,7 +3444,9 @@
     (set (match_operand:GPI 0 "register_operand" "=r,r")
  	(and:GPI (match_dup 1) (match_dup 2)))]
    ""
@@ -696,7 +1789,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
    [(set_attr "type" "logics_reg,logics_imm")]
  )
  
-@@ -3449,7 +3453,9 @@
+@@ -3449,7 +3460,9 @@
     (set (match_operand:DI 0 "register_operand" "=r,r")
  	(zero_extend:DI (and:SI (match_dup 1) (match_dup 2))))]
    ""
@@ -707,7 +1800,40 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
    [(set_attr "type" "logics_reg,logics_imm")]
  )
  
-@@ -3803,7 +3809,9 @@
+@@ -3775,16 +3788,23 @@
+   [(set_attr "type" "rbit")]
+ )
+ 
+-(define_expand "ctz<mode>2"
+-  [(match_operand:GPI 0 "register_operand")
+-   (match_operand:GPI 1 "register_operand")]
++;; Split after reload into RBIT + CLZ.  Since RBIT is represented as an UNSPEC
++;; it is unlikely to fold with any other operation, so keep this as a CTZ
++;; expression and split after reload to enable scheduling them apart if
++;; needed.
++
++(define_insn_and_split "ctz<mode>2"
++ [(set (match_operand:GPI           0 "register_operand" "=r")
++       (ctz:GPI (match_operand:GPI  1 "register_operand" "r")))]
+   ""
+-  {
+-    emit_insn (gen_rbit<mode>2 (operands[0], operands[1]));
+-    emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
+-    DONE;
+-  }
+-)
++  "#"
++  "reload_completed"
++  [(const_int 0)]
++  "
++  emit_insn (gen_rbit<mode>2 (operands[0], operands[1]));
++  emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
++  DONE;
++")
+ 
+ (define_insn "*and<mode>_compare0"
+   [(set (reg:CC_NZ CC_REGNUM)
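
The ctz<mode>2 change in the hunk above keeps count-trailing-zeros as a single CTZ rtx until after reload, then splits it into RBIT followed by CLZ so the two instructions can be scheduled apart. A small illustrative sketch:

/* AArch64 has no direct count-trailing-zeros instruction; GCC
   implements __builtin_ctz as bit-reverse then count-leading-zeros:
       rbit  w0, w0
       clz   w0, w0  */
unsigned count_trailing_zeros (unsigned x)
{
  return __builtin_ctz (x);   /* result undefined for x == 0 */
}
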
+@@ -3803,7 +3823,9 @@
  		  (match_operand:GPI 1 "aarch64_logical_operand" "r,<lconst>"))
  	 (const_int 0)))]
    ""
@@ -718,7 +1844,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
    [(set_attr "type" "logics_reg,logics_imm")]
  )
  
-@@ -3869,22 +3877,16 @@
+@@ -3869,22 +3891,16 @@
  (define_expand "ashl<mode>3"
    [(set (match_operand:SHORT 0 "register_operand")
  	(ashift:SHORT (match_operand:SHORT 1 "register_operand")
@@ -747,7 +1873,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
    }
  )
  
-@@ -3933,33 +3935,35 @@
+@@ -3933,33 +3949,35 @@
  
  ;; Logical left shift using SISD or Integer instruction
  (define_insn "*aarch64_ashl_sisd_or_int_<mode>3"
@@ -795,7 +1921,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  )
  
  (define_split
-@@ -3994,18 +3998,19 @@
+@@ -3994,18 +4012,19 @@
  
  ;; Arithmetic right shift using SISD or Integer instruction
  (define_insn "*aarch64_ashr_sisd_or_int_<mode>3"
@@ -820,7 +1946,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  )
  
  (define_split
-@@ -4097,21 +4102,25 @@
+@@ -4097,21 +4116,25 @@
    [(set (match_operand:GPI 0 "register_operand" "=r,r")
       (rotatert:GPI
         (match_operand:GPI 1 "register_operand" "r,r")
@@ -854,7 +1980,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  )
  
  (define_insn "*<optab><mode>3_insn"
-@@ -4135,7 +4144,7 @@
+@@ -4135,7 +4158,7 @@
    "UINTVAL (operands[3]) < GET_MODE_BITSIZE (<MODE>mode) &&
     (UINTVAL (operands[3]) + UINTVAL (operands[4]) == GET_MODE_BITSIZE (<MODE>mode))"
    "extr\\t%<w>0, %<w>1, %<w>2, %4"
@@ -863,7 +1989,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  )
  
  ;; There are no canonicalisation rules for ashift and lshiftrt inside an ior
-@@ -4150,7 +4159,7 @@
+@@ -4150,7 +4173,7 @@
     && (UINTVAL (operands[3]) + UINTVAL (operands[4])
         == GET_MODE_BITSIZE (<MODE>mode))"
    "extr\\t%<w>0, %<w>1, %<w>2, %4"
@@ -872,7 +1998,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  )
  
  ;; zero_extend version of the above
-@@ -4164,7 +4173,7 @@
+@@ -4164,7 +4187,7 @@
    "UINTVAL (operands[3]) < 32 &&
     (UINTVAL (operands[3]) + UINTVAL (operands[4]) == 32)"
    "extr\\t%w0, %w1, %w2, %4"
@@ -881,7 +2007,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  )
  
  (define_insn "*extrsi5_insn_uxtw_alt"
-@@ -4177,7 +4186,7 @@
+@@ -4177,7 +4200,7 @@
    "UINTVAL (operands[3]) < 32 &&
     (UINTVAL (operands[3]) + UINTVAL (operands[4]) == 32)"
    "extr\\t%w0, %w1, %w2, %4"
@@ -890,7 +2016,89 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  )
  
  (define_insn "*ror<mode>3_insn"
-@@ -5191,7 +5200,7 @@
+@@ -4608,6 +4631,36 @@
+   [(set_attr "type" "f_cvti2f")]
+ )
+ 
++;; Convert between fixed-point and floating-point (scalar modes)
++
++(define_insn "<FCVT_F2FIXED:fcvt_fixed_insn><GPF:mode>3"
++  [(set (match_operand:<GPF:FCVT_TARGET> 0 "register_operand" "=r, w")
++	(unspec:<GPF:FCVT_TARGET> [(match_operand:GPF 1 "register_operand" "w, w")
++				   (match_operand:SI 2 "immediate_operand" "i, i")]
++	 FCVT_F2FIXED))]
++  ""
++  "@
++   <FCVT_F2FIXED:fcvt_fixed_insn>\t%<GPF:w1>0, %<GPF:s>1, #%2
++   <FCVT_F2FIXED:fcvt_fixed_insn>\t%<GPF:s>0, %<GPF:s>1, #%2"
++  [(set_attr "type" "f_cvtf2i, neon_fp_to_int_<GPF:Vetype>")
++   (set_attr "fp" "yes, *")
++   (set_attr "simd" "*, yes")]
++)
++
++(define_insn "<FCVT_FIXED2F:fcvt_fixed_insn><GPI:mode>3"
++  [(set (match_operand:<GPI:FCVT_TARGET> 0 "register_operand" "=w, w")
++	(unspec:<GPI:FCVT_TARGET> [(match_operand:GPI 1 "register_operand" "r, w")
++				   (match_operand:SI 2 "immediate_operand" "i, i")]
++	 FCVT_FIXED2F))]
++  ""
++  "@
++   <FCVT_FIXED2F:fcvt_fixed_insn>\t%<GPI:v>0, %<GPI:w>1, #%2
++   <FCVT_FIXED2F:fcvt_fixed_insn>\t%<GPI:v>0, %<GPI:v>1, #%2"
++  [(set_attr "type" "f_cvti2f, neon_int_to_fp_<GPI:Vetype>")
++   (set_attr "fp" "yes, *")
++   (set_attr "simd" "*, yes")]
++)
++
+ ;; -------------------------------------------------------------------
+ ;; Floating-point arithmetic
+ ;; -------------------------------------------------------------------
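
The two patterns just added give the backend scvtf/ucvtf and fcvtzs/fcvtzu forms that take an immediate fractional-bit count; they back the vcvt*_n_* intrinsics this patch later rewrites from inline asm to builtins. A hedged usage sketch, using Q16.16 fixed point purely for illustration:

#include <arm_neon.h>

/* Q16.16 <-> float via the _n_ intrinsics, which now map to
   scvtf/fcvtzs with an immediate #16 rather than inline asm.  */
float   q16_to_float (int32_t q) { return vcvts_n_f32_s32 (q, 16); }
int32_t float_to_q16 (float f)   { return vcvts_n_s32_f32 (f, 16); }
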
+@@ -4662,11 +4715,22 @@
+   [(set_attr "type" "fmul<s>")]
+ )
+ 
+-(define_insn "div<mode>3"
++(define_expand "div<mode>3"
++ [(set (match_operand:GPF 0 "register_operand")
++       (div:GPF (match_operand:GPF 1 "general_operand")
++		(match_operand:GPF 2 "register_operand")))]
++ "TARGET_SIMD"
++{
++  if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
++    DONE;
++
++  operands[1] = force_reg (<MODE>mode, operands[1]);
++})
++
++(define_insn "*div<mode>3"
+   [(set (match_operand:GPF 0 "register_operand" "=w")
+-        (div:GPF
+-         (match_operand:GPF 1 "register_operand" "w")
+-         (match_operand:GPF 2 "register_operand" "w")))]
++        (div:GPF (match_operand:GPF 1 "register_operand" "w")
++	         (match_operand:GPF 2 "register_operand" "w")))]
+   "TARGET_FLOAT"
+   "fdiv\\t%<s>0, %<s>1, %<s>2"
+   [(set_attr "type" "fdiv<s>")]
+@@ -4680,7 +4744,16 @@
+   [(set_attr "type" "ffarith<s>")]
+ )
+ 
+-(define_insn "sqrt<mode>2"
++(define_expand "sqrt<mode>2"
++  [(set (match_operand:GPF 0 "register_operand")
++        (sqrt:GPF (match_operand:GPF 1 "register_operand")))]
++  "TARGET_FLOAT"
++{
++  if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
++    DONE;
++})
++
++(define_insn "*sqrt<mode>2"
+   [(set (match_operand:GPF 0 "register_operand" "=w")
+         (sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))]
+   "TARGET_FLOAT"
+@@ -5191,7 +5264,7 @@
  	 UNSPEC_SP_TEST))
     (clobber (match_scratch:PTR 3 "=&r"))]
    ""
@@ -899,79 +2107,77 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
    [(set_attr "length" "12")
     (set_attr "type" "multiple")])
  
+--- a/src/gcc/config/aarch64/aarch64.opt
++++ b/src/gcc/config/aarch64/aarch64.opt
+@@ -151,5 +151,19 @@ PC relative literal loads.
+ 
+ mlow-precision-recip-sqrt
+ Common Var(flag_mrecip_low_precision_sqrt) Optimization
+-When calculating the reciprocal square root approximation,
+-uses one less step than otherwise, thus reducing latency and precision.
++Enable the reciprocal square root approximation.  Enabling this reduces
++precision of reciprocal square root results to about 16 bits for
++single precision and to 32 bits for double precision.
++
++mlow-precision-sqrt
++Common Var(flag_mlow_precision_sqrt) Optimization
++Enable the square root approximation.  Enabling this reduces
++precision of square root results to about 16 bits for
++single precision and to 32 bits for double precision.
++If enabled, it implies -mlow-precision-recip-sqrt.
++
++mlow-precision-div
++Common Var(flag_mlow_precision_div) Optimization
++Enable the division approximation.  Enabling this reduces
++precision of division results to about 16 bits for
++single precision and to 32 bits for double precision.
 --- a/src/gcc/config/aarch64/arm_neon.h
 +++ b/src/gcc/config/aarch64/arm_neon.h
-@@ -7938,61 +7938,6 @@ vmovn_u64 (uint64x2_t a)
+@@ -5440,17 +5440,6 @@ vabaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
    return result;
  }
  
 -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vmul_n_f32 (float32x2_t a, float32_t b)
+-vabd_f32 (float32x2_t a, float32x2_t b)
 -{
 -  float32x2_t result;
--  __asm__ ("fmul %0.2s,%1.2s,%2.s[0]"
--           : "=w"(result)
--           : "w"(a), "w"(b)
--           : /* No clobbers */);
--  return result;
--}
--
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vmul_n_s16 (int16x4_t a, int16_t b)
--{
--  int16x4_t result;
--  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
--           : "=w"(result)
--           : "w"(a), "x"(b)
--           : /* No clobbers */);
--  return result;
--}
--
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vmul_n_s32 (int32x2_t a, int32_t b)
--{
--  int32x2_t result;
--  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
+-  __asm__ ("fabd %0.2s, %1.2s, %2.2s"
 -           : "=w"(result)
 -           : "w"(a), "w"(b)
 -           : /* No clobbers */);
 -  return result;
 -}
 -
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vmul_n_u16 (uint16x4_t a, uint16_t b)
--{
--  uint16x4_t result;
--  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
--           : "=w"(result)
--           : "w"(a), "x"(b)
--           : /* No clobbers */);
--  return result;
--}
--
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vmul_n_u32 (uint32x2_t a, uint32_t b)
+ __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+ vabd_s8 (int8x8_t a, int8x8_t b)
+ {
+@@ -5517,17 +5506,6 @@ vabd_u32 (uint32x2_t a, uint32x2_t b)
+   return result;
+ }
+ 
+-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+-vabdd_f64 (float64_t a, float64_t b)
 -{
--  uint32x2_t result;
--  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
+-  float64_t result;
+-  __asm__ ("fabd %d0, %d1, %d2"
 -           : "=w"(result)
 -           : "w"(a), "w"(b)
 -           : /* No clobbers */);
 -  return result;
 -}
 -
- #define vmull_high_lane_s16(a, b, c)                                    \
-   __extension__                                                         \
-     ({                                                                  \
-@@ -8443,227 +8388,6 @@ vmull_u32 (uint32x2_t a, uint32x2_t b)
+ __extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+ vabdl_high_s8 (int8x16_t a, int8x16_t b)
+ {
+@@ -5660,28 +5638,6 @@ vabdl_u32 (uint32x2_t a, uint32x2_t b)
    return result;
  }
  
 -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vmulq_n_f32 (float32x4_t a, float32_t b)
+-vabdq_f32 (float32x4_t a, float32x4_t b)
 -{
 -  float32x4_t result;
--  __asm__ ("fmul %0.4s,%1.4s,%2.s[0]"
+-  __asm__ ("fabd %0.4s, %1.4s, %2.4s"
 -           : "=w"(result)
 -           : "w"(a), "w"(b)
 -           : /* No clobbers */);
@@ -979,34 +2185,390 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
 -}
 -
 -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vmulq_n_f64 (float64x2_t a, float64_t b)
+-vabdq_f64 (float64x2_t a, float64x2_t b)
 -{
 -  float64x2_t result;
--  __asm__ ("fmul %0.2d,%1.2d,%2.d[0]"
+-  __asm__ ("fabd %0.2d, %1.2d, %2.2d"
 -           : "=w"(result)
 -           : "w"(a), "w"(b)
 -           : /* No clobbers */);
 -  return result;
 -}
 -
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmulq_n_s16 (int16x8_t a, int16_t b)
+ __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+ vabdq_s8 (int8x16_t a, int8x16_t b)
+ {
+@@ -5748,17 +5704,6 @@ vabdq_u32 (uint32x4_t a, uint32x4_t b)
+   return result;
+ }
+ 
+-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+-vabds_f32 (float32_t a, float32_t b)
 -{
--  int16x8_t result;
--  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
+-  float32_t result;
+-  __asm__ ("fabd %s0, %s1, %s2"
 -           : "=w"(result)
--           : "w"(a), "x"(b)
+-           : "w"(a), "w"(b)
 -           : /* No clobbers */);
 -  return result;
 -}
 -
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmulq_n_s32 (int32x4_t a, int32_t b)
--{
--  int32x4_t result;
--  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
--           : "=w"(result)
--           : "w"(a), "w"(b)
+ __extension__ static __inline int16_t __attribute__ ((__always_inline__))
+ vaddlv_s8 (int8x8_t a)
+ {
+@@ -6025,246 +5970,6 @@ vaddlvq_u32 (uint32x4_t a)
+        result;                                                          \
+      })
+ 
+-#define vcvt_n_f32_s32(a, b)                                            \
+-  __extension__                                                         \
+-    ({                                                                  \
+-       int32x2_t a_ = (a);                                              \
+-       float32x2_t result;                                              \
+-       __asm__ ("scvtf %0.2s, %1.2s, #%2"                               \
+-                : "=w"(result)                                          \
+-                : "w"(a_), "i"(b)                                       \
+-                : /* No clobbers */);                                   \
+-       result;                                                          \
+-     })
+-
+-#define vcvt_n_f32_u32(a, b)                                            \
+-  __extension__                                                         \
+-    ({                                                                  \
+-       uint32x2_t a_ = (a);                                             \
+-       float32x2_t result;                                              \
+-       __asm__ ("ucvtf %0.2s, %1.2s, #%2"                               \
+-                : "=w"(result)                                          \
+-                : "w"(a_), "i"(b)                                       \
+-                : /* No clobbers */);                                   \
+-       result;                                                          \
+-     })
+-
+-#define vcvt_n_s32_f32(a, b)                                            \
+-  __extension__                                                         \
+-    ({                                                                  \
+-       float32x2_t a_ = (a);                                            \
+-       int32x2_t result;                                                \
+-       __asm__ ("fcvtzs %0.2s, %1.2s, #%2"                              \
+-                : "=w"(result)                                          \
+-                : "w"(a_), "i"(b)                                       \
+-                : /* No clobbers */);                                   \
+-       result;                                                          \
+-     })
+-
+-#define vcvt_n_u32_f32(a, b)                                            \
+-  __extension__                                                         \
+-    ({                                                                  \
+-       float32x2_t a_ = (a);                                            \
+-       uint32x2_t result;                                               \
+-       __asm__ ("fcvtzu %0.2s, %1.2s, #%2"                              \
+-                : "=w"(result)                                          \
+-                : "w"(a_), "i"(b)                                       \
+-                : /* No clobbers */);                                   \
+-       result;                                                          \
+-     })
+-
+-#define vcvtd_n_f64_s64(a, b)                                           \
+-  __extension__                                                         \
+-    ({                                                                  \
+-       int64_t a_ = (a);                                                \
+-       float64_t result;                                                \
+-       __asm__ ("scvtf %d0,%d1,%2"                                      \
+-                : "=w"(result)                                          \
+-                : "w"(a_), "i"(b)                                       \
+-                : /* No clobbers */);                                   \
+-       result;                                                          \
+-     })
+-
+-#define vcvtd_n_f64_u64(a, b)                                           \
+-  __extension__                                                         \
+-    ({                                                                  \
+-       uint64_t a_ = (a);                                               \
+-       float64_t result;                                                \
+-       __asm__ ("ucvtf %d0,%d1,%2"                                      \
+-                : "=w"(result)                                          \
+-                : "w"(a_), "i"(b)                                       \
+-                : /* No clobbers */);                                   \
+-       result;                                                          \
+-     })
+-
+-#define vcvtd_n_s64_f64(a, b)                                           \
+-  __extension__                                                         \
+-    ({                                                                  \
+-       float64_t a_ = (a);                                              \
+-       int64_t result;                                                  \
+-       __asm__ ("fcvtzs %d0,%d1,%2"                                     \
+-                : "=w"(result)                                          \
+-                : "w"(a_), "i"(b)                                       \
+-                : /* No clobbers */);                                   \
+-       result;                                                          \
+-     })
+-
+-#define vcvtd_n_u64_f64(a, b)                                           \
+-  __extension__                                                         \
+-    ({                                                                  \
+-       float64_t a_ = (a);                                              \
+-       uint64_t result;                                                 \
+-       __asm__ ("fcvtzu %d0,%d1,%2"                                     \
+-                : "=w"(result)                                          \
+-                : "w"(a_), "i"(b)                                       \
+-                : /* No clobbers */);                                   \
+-       result;                                                          \
+-     })
+-
+-#define vcvtq_n_f32_s32(a, b)                                           \
+-  __extension__                                                         \
+-    ({                                                                  \
+-       int32x4_t a_ = (a);                                              \
+-       float32x4_t result;                                              \
+-       __asm__ ("scvtf %0.4s, %1.4s, #%2"                               \
+-                : "=w"(result)                                          \
+-                : "w"(a_), "i"(b)                                       \
+-                : /* No clobbers */);                                   \
+-       result;                                                          \
+-     })
+-
+-#define vcvtq_n_f32_u32(a, b)                                           \
+-  __extension__                                                         \
+-    ({                                                                  \
+-       uint32x4_t a_ = (a);                                             \
+-       float32x4_t result;                                              \
+-       __asm__ ("ucvtf %0.4s, %1.4s, #%2"                               \
+-                : "=w"(result)                                          \
+-                : "w"(a_), "i"(b)                                       \
+-                : /* No clobbers */);                                   \
+-       result;                                                          \
+-     })
+-
+-#define vcvtq_n_f64_s64(a, b)                                           \
+-  __extension__                                                         \
+-    ({                                                                  \
+-       int64x2_t a_ = (a);                                              \
+-       float64x2_t result;                                              \
+-       __asm__ ("scvtf %0.2d, %1.2d, #%2"                               \
+-                : "=w"(result)                                          \
+-                : "w"(a_), "i"(b)                                       \
+-                : /* No clobbers */);                                   \
+-       result;                                                          \
+-     })
+-
+-#define vcvtq_n_f64_u64(a, b)                                           \
+-  __extension__                                                         \
+-    ({                                                                  \
+-       uint64x2_t a_ = (a);                                             \
+-       float64x2_t result;                                              \
+-       __asm__ ("ucvtf %0.2d, %1.2d, #%2"                               \
+-                : "=w"(result)                                          \
+-                : "w"(a_), "i"(b)                                       \
+-                : /* No clobbers */);                                   \
+-       result;                                                          \
+-     })
+-
+-#define vcvtq_n_s32_f32(a, b)                                           \
+-  __extension__                                                         \
+-    ({                                                                  \
+-       float32x4_t a_ = (a);                                            \
+-       int32x4_t result;                                                \
+-       __asm__ ("fcvtzs %0.4s, %1.4s, #%2"                              \
+-                : "=w"(result)                                          \
+-                : "w"(a_), "i"(b)                                       \
+-                : /* No clobbers */);                                   \
+-       result;                                                          \
+-     })
+-
+-#define vcvtq_n_s64_f64(a, b)                                           \
+-  __extension__                                                         \
+-    ({                                                                  \
+-       float64x2_t a_ = (a);                                            \
+-       int64x2_t result;                                                \
+-       __asm__ ("fcvtzs %0.2d, %1.2d, #%2"                              \
+-                : "=w"(result)                                          \
+-                : "w"(a_), "i"(b)                                       \
+-                : /* No clobbers */);                                   \
+-       result;                                                          \
+-     })
+-
+-#define vcvtq_n_u32_f32(a, b)                                           \
+-  __extension__                                                         \
+-    ({                                                                  \
+-       float32x4_t a_ = (a);                                            \
+-       uint32x4_t result;                                               \
+-       __asm__ ("fcvtzu %0.4s, %1.4s, #%2"                              \
+-                : "=w"(result)                                          \
+-                : "w"(a_), "i"(b)                                       \
+-                : /* No clobbers */);                                   \
+-       result;                                                          \
+-     })
+-
+-#define vcvtq_n_u64_f64(a, b)                                           \
+-  __extension__                                                         \
+-    ({                                                                  \
+-       float64x2_t a_ = (a);                                            \
+-       uint64x2_t result;                                               \
+-       __asm__ ("fcvtzu %0.2d, %1.2d, #%2"                              \
+-                : "=w"(result)                                          \
+-                : "w"(a_), "i"(b)                                       \
+-                : /* No clobbers */);                                   \
+-       result;                                                          \
+-     })
+-
+-#define vcvts_n_f32_s32(a, b)                                           \
+-  __extension__                                                         \
+-    ({                                                                  \
+-       int32_t a_ = (a);                                                \
+-       float32_t result;                                                \
+-       __asm__ ("scvtf %s0,%s1,%2"                                      \
+-                : "=w"(result)                                          \
+-                : "w"(a_), "i"(b)                                       \
+-                : /* No clobbers */);                                   \
+-       result;                                                          \
+-     })
+-
+-#define vcvts_n_f32_u32(a, b)                                           \
+-  __extension__                                                         \
+-    ({                                                                  \
+-       uint32_t a_ = (a);                                               \
+-       float32_t result;                                                \
+-       __asm__ ("ucvtf %s0,%s1,%2"                                      \
+-                : "=w"(result)                                          \
+-                : "w"(a_), "i"(b)                                       \
+-                : /* No clobbers */);                                   \
+-       result;                                                          \
+-     })
+-
+-#define vcvts_n_s32_f32(a, b)                                           \
+-  __extension__                                                         \
+-    ({                                                                  \
+-       float32_t a_ = (a);                                              \
+-       int32_t result;                                                  \
+-       __asm__ ("fcvtzs %s0,%s1,%2"                                     \
+-                : "=w"(result)                                          \
+-                : "w"(a_), "i"(b)                                       \
+-                : /* No clobbers */);                                   \
+-       result;                                                          \
+-     })
+-
+-#define vcvts_n_u32_f32(a, b)                                           \
+-  __extension__                                                         \
+-    ({                                                                  \
+-       float32_t a_ = (a);                                              \
+-       uint32_t result;                                                 \
+-       __asm__ ("fcvtzu %s0,%s1,%2"                                     \
+-                : "=w"(result)                                          \
+-                : "w"(a_), "i"(b)                                       \
+-                : /* No clobbers */);                                   \
+-       result;                                                          \
+-     })
+-
+ __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+ vcvtx_f32_f64 (float64x2_t a)
+ {
+@@ -7938,61 +7643,6 @@ vmovn_u64 (uint64x2_t a)
+   return result;
+ }
+ 
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vmul_n_f32 (float32x2_t a, float32_t b)
+-{
+-  float32x2_t result;
+-  __asm__ ("fmul %0.2s,%1.2s,%2.s[0]"
+-           : "=w"(result)
+-           : "w"(a), "w"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+-vmul_n_s16 (int16x4_t a, int16_t b)
+-{
+-  int16x4_t result;
+-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
+-           : "=w"(result)
+-           : "w"(a), "x"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+-vmul_n_s32 (int32x2_t a, int32_t b)
+-{
+-  int32x2_t result;
+-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
+-           : "=w"(result)
+-           : "w"(a), "w"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+-vmul_n_u16 (uint16x4_t a, uint16_t b)
+-{
+-  uint16x4_t result;
+-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
+-           : "=w"(result)
+-           : "w"(a), "x"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+-vmul_n_u32 (uint32x2_t a, uint32_t b)
+-{
+-  uint32x2_t result;
+-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
+-           : "=w"(result)
+-           : "w"(a), "w"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+ #define vmull_high_lane_s16(a, b, c)                                    \
+   __extension__                                                         \
+     ({                                                                  \
+@@ -8443,227 +8093,6 @@ vmull_u32 (uint32x2_t a, uint32x2_t b)
+   return result;
+ }
+ 
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vmulq_n_f32 (float32x4_t a, float32_t b)
+-{
+-  float32x4_t result;
+-  __asm__ ("fmul %0.4s,%1.4s,%2.s[0]"
+-           : "=w"(result)
+-           : "w"(a), "w"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vmulq_n_f64 (float64x2_t a, float64_t b)
+-{
+-  float64x2_t result;
+-  __asm__ ("fmul %0.2d,%1.2d,%2.d[0]"
+-           : "=w"(result)
+-           : "w"(a), "w"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vmulq_n_s16 (int16x8_t a, int16_t b)
+-{
+-  int16x8_t result;
+-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
+-           : "=w"(result)
+-           : "w"(a), "x"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vmulq_n_s32 (int32x4_t a, int32_t b)
+-{
+-  int32x4_t result;
+-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
+-           : "=w"(result)
+-           : "w"(a), "w"(b)
 -           : /* No clobbers */);
 -  return result;
 -}
@@ -1111,87 +2673,600 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
 -}
 -
 -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vmvnq_p8 (poly8x16_t a)
--{
--  poly8x16_t result;
--  __asm__ ("mvn %0.16b,%1.16b"
--           : "=w"(result)
--           : "w"(a)
--           : /* No clobbers */);
--  return result;
--}
--
+-vmvnq_p8 (poly8x16_t a)
+-{
+-  poly8x16_t result;
+-  __asm__ ("mvn %0.16b,%1.16b"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+-vmvnq_s8 (int8x16_t a)
+-{
+-  int8x16_t result;
+-  __asm__ ("mvn %0.16b,%1.16b"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+-vmvnq_s16 (int16x8_t a)
+-{
+-  int16x8_t result;
+-  __asm__ ("mvn %0.16b,%1.16b"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+-vmvnq_s32 (int32x4_t a)
+-{
+-  int32x4_t result;
+-  __asm__ ("mvn %0.16b,%1.16b"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+-vmvnq_u8 (uint8x16_t a)
+-{
+-  uint8x16_t result;
+-  __asm__ ("mvn %0.16b,%1.16b"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+-vmvnq_u16 (uint16x8_t a)
+-{
+-  uint16x8_t result;
+-  __asm__ ("mvn %0.16b,%1.16b"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+-vmvnq_u32 (uint32x4_t a)
+-{
+-  uint32x4_t result;
+-  __asm__ ("mvn %0.16b,%1.16b"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-
+ __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+ vpadal_s8 (int16x4_t a, int8x8_t b)
+ {
+@@ -8785,24 +8214,13 @@ vpadalq_u16 (uint32x4_t a, uint16x8_t b)
+   return result;
+ }
+ 
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+-vpadalq_u32 (uint64x2_t a, uint32x4_t b)
+-{
+-  uint64x2_t result;
+-  __asm__ ("uadalp %0.2d,%2.4s"
+-           : "=w"(result)
+-           : "0"(a), "w"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vpadd_f32 (float32x2_t a, float32x2_t b)
++__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++vpadalq_u32 (uint64x2_t a, uint32x4_t b)
+ {
+-  float32x2_t result;
+-  __asm__ ("faddp %0.2s,%1.2s,%2.2s"
++  uint64x2_t result;
++  __asm__ ("uadalp %0.2d,%2.4s"
+            : "=w"(result)
+-           : "w"(a), "w"(b)
++           : "0"(a), "w"(b)
+            : /* No clobbers */);
+   return result;
+ }
+@@ -8939,28 +8357,6 @@ vpaddlq_u32 (uint32x4_t a)
+   return result;
+ }
+ 
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vpaddq_f32 (float32x4_t a, float32x4_t b)
+-{
+-  float32x4_t result;
+-  __asm__ ("faddp %0.4s,%1.4s,%2.4s"
+-           : "=w"(result)
+-           : "w"(a), "w"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vpaddq_f64 (float64x2_t a, float64x2_t b)
+-{
+-  float64x2_t result;
+-  __asm__ ("faddp %0.2d,%1.2d,%2.2d"
+-           : "=w"(result)
+-           : "w"(a), "w"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+ __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+ vpaddq_s8 (int8x16_t a, int8x16_t b)
+ {
+@@ -9049,17 +8445,6 @@ vpaddq_u64 (uint64x2_t a, uint64x2_t b)
+   return result;
+ }
+ 
+-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+-vpadds_f32 (float32x2_t a)
+-{
+-  float32_t result;
+-  __asm__ ("faddp %s0,%1.2s"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+ __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+ vqdmulh_n_s16 (int16x4_t a, int16_t b)
+ {
+@@ -9679,28 +9064,6 @@ vqrdmulhq_n_s32 (int32x4_t a, int32_t b)
+        result;                                                          \
+      })
+ 
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vrsqrte_f32 (float32x2_t a)
+-{
+-  float32x2_t result;
+-  __asm__ ("frsqrte %0.2s,%1.2s"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+-vrsqrte_f64 (float64x1_t a)
+-{
+-  float64x1_t result;
+-  __asm__ ("frsqrte %d0,%d1"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+ __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+ vrsqrte_u32 (uint32x2_t a)
+ {
+@@ -9712,39 +9075,6 @@ vrsqrte_u32 (uint32x2_t a)
+   return result;
+ }
+ 
+-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+-vrsqrted_f64 (float64_t a)
+-{
+-  float64_t result;
+-  __asm__ ("frsqrte %d0,%d1"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vrsqrteq_f32 (float32x4_t a)
+-{
+-  float32x4_t result;
+-  __asm__ ("frsqrte %0.4s,%1.4s"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vrsqrteq_f64 (float64x2_t a)
+-{
+-  float64x2_t result;
+-  __asm__ ("frsqrte %0.2d,%1.2d"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+ __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+ vrsqrteq_u32 (uint32x4_t a)
+ {
+@@ -9756,72 +9086,6 @@ vrsqrteq_u32 (uint32x4_t a)
+   return result;
+ }
+ 
+-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+-vrsqrtes_f32 (float32_t a)
+-{
+-  float32_t result;
+-  __asm__ ("frsqrte %s0,%s1"
+-           : "=w"(result)
+-           : "w"(a)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vrsqrts_f32 (float32x2_t a, float32x2_t b)
+-{
+-  float32x2_t result;
+-  __asm__ ("frsqrts %0.2s,%1.2s,%2.2s"
+-           : "=w"(result)
+-           : "w"(a), "w"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+-vrsqrtsd_f64 (float64_t a, float64_t b)
+-{
+-  float64_t result;
+-  __asm__ ("frsqrts %d0,%d1,%d2"
+-           : "=w"(result)
+-           : "w"(a), "w"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vrsqrtsq_f32 (float32x4_t a, float32x4_t b)
+-{
+-  float32x4_t result;
+-  __asm__ ("frsqrts %0.4s,%1.4s,%2.4s"
+-           : "=w"(result)
+-           : "w"(a), "w"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vrsqrtsq_f64 (float64x2_t a, float64x2_t b)
+-{
+-  float64x2_t result;
+-  __asm__ ("frsqrts %0.2d,%1.2d,%2.2d"
+-           : "=w"(result)
+-           : "w"(a), "w"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+-vrsqrtss_f32 (float32_t a, float32_t b)
+-{
+-  float32_t result;
+-  __asm__ ("frsqrts %s0,%s1,%s2"
+-           : "=w"(result)
+-           : "w"(a), "w"(b)
+-           : /* No clobbers */);
+-  return result;
+-}
+-
+ #define vshrn_high_n_s16(a, b, c)                                       \
+   __extension__                                                         \
+     ({                                                                  \
+@@ -10872,6 +10136,45 @@ vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx)
+ 
+ /* Start of optimal implementations in approved order.  */
+ 
++/* vabd.  */
++
++__extension__ static __inline float32_t __attribute__ ((__always_inline__))
++vabds_f32 (float32_t __a, float32_t __b)
++{
++  return __builtin_aarch64_fabdsf (__a, __b);
++}
++
++__extension__ static __inline float64_t __attribute__ ((__always_inline__))
++vabdd_f64 (float64_t __a, float64_t __b)
++{
++  return __builtin_aarch64_fabddf (__a, __b);
++}
++
++__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++vabd_f32 (float32x2_t __a, float32x2_t __b)
++{
++  return __builtin_aarch64_fabdv2sf (__a, __b);
++}
++
++__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
++vabd_f64 (float64x1_t __a, float64x1_t __b)
++{
++  return (float64x1_t) {vabdd_f64 (vget_lane_f64 (__a, 0),
++				   vget_lane_f64 (__b, 0))};
++}
++
++__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++vabdq_f32 (float32x4_t __a, float32x4_t __b)
++{
++  return __builtin_aarch64_fabdv4sf (__a, __b);
++}
++
++__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
++vabdq_f64 (float64x2_t __a, float64x2_t __b)
++{
++  return __builtin_aarch64_fabdv2df (__a, __b);
++}
++
+ /* vabs  */
+ 
+ __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
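
The block above replaces the inline-asm vabd* definitions deleted earlier with __builtin_aarch64_fabd* builtins, so the compiler sees real RTL it can schedule and fold rather than opaque asm. Usage sketch:

#include <arm_neon.h>

/* fabd computes the element-wise absolute difference |a - b|
   in a single instruction.  */
float32x4_t absdiff (float32x4_t a, float32x4_t b)
{
  return vabdq_f32 (a, b);
}
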
+@@ -13026,84 +12329,208 @@ vcnt_p8 (poly8x8_t __a)
+ __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+ vcnt_s8 (int8x8_t __a)
+ {
+-  return __builtin_aarch64_popcountv8qi (__a);
++  return __builtin_aarch64_popcountv8qi (__a);
++}
++
++__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++vcnt_u8 (uint8x8_t __a)
++{
++  return (uint8x8_t) __builtin_aarch64_popcountv8qi ((int8x8_t) __a);
++}
++
++__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++vcntq_p8 (poly8x16_t __a)
++{
++  return (poly8x16_t) __builtin_aarch64_popcountv16qi ((int8x16_t) __a);
++}
++
++__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++vcntq_s8 (int8x16_t __a)
++{
++  return __builtin_aarch64_popcountv16qi (__a);
++}
++
++__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++vcntq_u8 (uint8x16_t __a)
++{
++  return (uint8x16_t) __builtin_aarch64_popcountv16qi ((int8x16_t) __a);
++}
++
++/* vcvt (double -> float).  */
++
++__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++vcvt_f16_f32 (float32x4_t __a)
++{
++  return __builtin_aarch64_float_truncate_lo_v4hf (__a);
++}
++
++__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++vcvt_high_f16_f32 (float16x4_t __a, float32x4_t __b)
++{
++  return __builtin_aarch64_float_truncate_hi_v8hf (__a, __b);
++}
++
++__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++vcvt_f32_f64 (float64x2_t __a)
++{
++  return __builtin_aarch64_float_truncate_lo_v2sf (__a);
++}
++
++__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++vcvt_high_f32_f64 (float32x2_t __a, float64x2_t __b)
++{
++  return __builtin_aarch64_float_truncate_hi_v4sf (__a, __b);
++}
++
++/* vcvt (float -> double).  */
++
++__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++vcvt_f32_f16 (float16x4_t __a)
++{
++  return __builtin_aarch64_float_extend_lo_v4sf (__a);
++}
++
++__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
++vcvt_f64_f32 (float32x2_t __a)
++{
++
++  return __builtin_aarch64_float_extend_lo_v2df (__a);
++}
++
++__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++vcvt_high_f32_f16 (float16x8_t __a)
++{
++  return __builtin_aarch64_vec_unpacks_hi_v8hf (__a);
++}
++
++__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
++vcvt_high_f64_f32 (float32x4_t __a)
++{
++  return __builtin_aarch64_vec_unpacks_hi_v4sf (__a);
++}
++
++/* vcvt (<u>fixed-point -> float).  */
++
++__extension__ static __inline float64_t __attribute__ ((__always_inline__))
++vcvtd_n_f64_s64 (int64_t __a, const int __b)
++{
++  return __builtin_aarch64_scvtfdi (__a, __b);
++}
++
++__extension__ static __inline float64_t __attribute__ ((__always_inline__))
++vcvtd_n_f64_u64 (uint64_t __a, const int __b)
++{
++  return __builtin_aarch64_ucvtfdi_sus (__a, __b);
++}
++
++__extension__ static __inline float32_t __attribute__ ((__always_inline__))
++vcvts_n_f32_s32 (int32_t __a, const int __b)
++{
++  return __builtin_aarch64_scvtfsi (__a, __b);
++}
++
++__extension__ static __inline float32_t __attribute__ ((__always_inline__))
++vcvts_n_f32_u32 (uint32_t __a, const int __b)
++{
++  return __builtin_aarch64_ucvtfsi_sus (__a, __b);
++}
++
++__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++vcvt_n_f32_s32 (int32x2_t __a, const int __b)
++{
++  return __builtin_aarch64_scvtfv2si (__a, __b);
++}
++
++__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++vcvt_n_f32_u32 (uint32x2_t __a, const int __b)
++{
++  return __builtin_aarch64_ucvtfv2si_sus (__a, __b);
+ }
+ 
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+-vcnt_u8 (uint8x8_t __a)
++__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++vcvtq_n_f32_s32 (int32x4_t __a, const int __b)
+ {
+-  return (uint8x8_t) __builtin_aarch64_popcountv8qi ((int8x8_t) __a);
++  return __builtin_aarch64_scvtfv4si (__a, __b);
+ }
+ 
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+-vcntq_p8 (poly8x16_t __a)
++__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++vcvtq_n_f32_u32 (uint32x4_t __a, const int __b)
+ {
+-  return (poly8x16_t) __builtin_aarch64_popcountv16qi ((int8x16_t) __a);
++  return __builtin_aarch64_ucvtfv4si_sus (__a, __b);
+ }
+ 
 -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vmvnq_s8 (int8x16_t a)
--{
--  int8x16_t result;
--  __asm__ ("mvn %0.16b,%1.16b"
--           : "=w"(result)
--           : "w"(a)
--           : /* No clobbers */);
--  return result;
--}
--
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmvnq_s16 (int16x8_t a)
--{
--  int16x8_t result;
--  __asm__ ("mvn %0.16b,%1.16b"
--           : "=w"(result)
--           : "w"(a)
--           : /* No clobbers */);
--  return result;
--}
--
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmvnq_s32 (int32x4_t a)
--{
--  int32x4_t result;
--  __asm__ ("mvn %0.16b,%1.16b"
--           : "=w"(result)
--           : "w"(a)
--           : /* No clobbers */);
--  return result;
--}
--
+-vcntq_s8 (int8x16_t __a)
++__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
++vcvtq_n_f64_s64 (int64x2_t __a, const int __b)
+ {
+-  return __builtin_aarch64_popcountv16qi (__a);
++  return __builtin_aarch64_scvtfv2di (__a, __b);
+ }
+ 
 -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vmvnq_u8 (uint8x16_t a)
--{
--  uint8x16_t result;
--  __asm__ ("mvn %0.16b,%1.16b"
--           : "=w"(result)
--           : "w"(a)
--           : /* No clobbers */);
--  return result;
--}
--
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmvnq_u16 (uint16x8_t a)
--{
--  uint16x8_t result;
--  __asm__ ("mvn %0.16b,%1.16b"
--           : "=w"(result)
--           : "w"(a)
--           : /* No clobbers */);
--  return result;
--}
--
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmvnq_u32 (uint32x4_t a)
--{
--  uint32x4_t result;
--  __asm__ ("mvn %0.16b,%1.16b"
--           : "=w"(result)
--           : "w"(a)
--           : /* No clobbers */);
--  return result;
--}
--
--
- __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
- vpadal_s8 (int16x4_t a, int8x8_t b)
+-vcntq_u8 (uint8x16_t __a)
++__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
++vcvtq_n_f64_u64 (uint64x2_t __a, const int __b)
+ {
+-  return (uint8x16_t) __builtin_aarch64_popcountv16qi ((int8x16_t) __a);
++  return __builtin_aarch64_ucvtfv2di_sus (__a, __b);
+ }
+ 
+-/* vcvt (double -> float).  */
++/* vcvt (float -> <u>fixed-point).  */
+ 
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+-vcvt_f16_f32 (float32x4_t __a)
++__extension__ static __inline int64_t __attribute__ ((__always_inline__))
++vcvtd_n_s64_f64 (float64_t __a, const int __b)
+ {
+-  return __builtin_aarch64_float_truncate_lo_v4hf (__a);
++  return __builtin_aarch64_fcvtzsdf (__a, __b);
+ }
+ 
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+-vcvt_high_f16_f32 (float16x4_t __a, float32x4_t __b)
++__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
++vcvtd_n_u64_f64 (float64_t __a, const int __b)
+ {
+-  return __builtin_aarch64_float_truncate_hi_v8hf (__a, __b);
++  return __builtin_aarch64_fcvtzudf_uss (__a, __b);
+ }
+ 
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+-vcvt_f32_f64 (float64x2_t __a)
++__extension__ static __inline int32_t __attribute__ ((__always_inline__))
++vcvts_n_s32_f32 (float32_t __a, const int __b)
+ {
+-  return __builtin_aarch64_float_truncate_lo_v2sf (__a);
++  return __builtin_aarch64_fcvtzssf (__a, __b);
+ }
+ 
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vcvt_high_f32_f64 (float32x2_t __a, float64x2_t __b)
++__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
++vcvts_n_u32_f32 (float32_t __a, const int __b)
+ {
+-  return __builtin_aarch64_float_truncate_hi_v4sf (__a, __b);
++  return __builtin_aarch64_fcvtzusf_uss (__a, __b);
+ }
+ 
+-/* vcvt (float -> double).  */
++__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++vcvt_n_s32_f32 (float32x2_t __a, const int __b)
++{
++  return __builtin_aarch64_fcvtzsv2sf (__a, __b);
++}
+ 
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vcvt_f32_f16 (float16x4_t __a)
++__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++vcvt_n_u32_f32 (float32x2_t __a, const int __b)
+ {
+-  return __builtin_aarch64_float_extend_lo_v4sf (__a);
++  return __builtin_aarch64_fcvtzuv2sf_uss (__a, __b);
+ }
+ 
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vcvt_f64_f32 (float32x2_t __a)
++__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++vcvtq_n_s32_f32 (float32x4_t __a, const int __b)
+ {
++  return __builtin_aarch64_fcvtzsv4sf (__a, __b);
++}
+ 
+-  return __builtin_aarch64_float_extend_lo_v2df (__a);
++__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++vcvtq_n_u32_f32 (float32x4_t __a, const int __b)
++{
++  return __builtin_aarch64_fcvtzuv4sf_uss (__a, __b);
+ }
+ 
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+-vcvt_high_f32_f16 (float16x8_t __a)
++__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++vcvtq_n_s64_f64 (float64x2_t __a, const int __b)
+ {
+-  return __builtin_aarch64_vec_unpacks_hi_v8hf (__a);
++  return __builtin_aarch64_fcvtzsv2df (__a, __b);
+ }
+ 
+-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+-vcvt_high_f64_f32 (float32x4_t __a)
++__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++vcvtq_n_u64_f64 (float64x2_t __a, const int __b)
  {
-@@ -14456,6 +14180,12 @@ vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
+-  return __builtin_aarch64_vec_unpacks_hi_v4sf (__a);
++  return __builtin_aarch64_fcvtzuv2df_uss (__a, __b);
+ }
+ 
+ /* vcvt  (<u>int -> float)  */
+@@ -14456,6 +13883,12 @@ vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
    return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a);
  }
  
@@ -1204,7 +3279,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
  vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
  {
-@@ -14597,6 +14327,29 @@ vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
+@@ -14597,6 +14030,29 @@ vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
    return __builtin_aarch64_fmav2df (-__b, __c, __a);
  }
  
@@ -1234,7 +3309,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  
  /* vfms_lane  */
  
-@@ -18895,6 +18648,160 @@ vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __lane)
+@@ -18895,6 +18351,160 @@ vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __lane)
    return __a * __aarch64_vget_lane_any (__b, __lane);
  }
  
@@ -1395,9 +3470,160 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  /* vneg  */
  
  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+@@ -18971,6 +18581,24 @@ vnegq_s64 (int64x2_t __a)
+ 
+ /* vpadd  */
+ 
++__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++vpadd_f32 (float32x2_t __a, float32x2_t __b)
++{
++  return __builtin_aarch64_faddpv2sf (__a, __b);
++}
++
++__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++vpaddq_f32 (float32x4_t __a, float32x4_t __b)
++{
++  return __builtin_aarch64_faddpv4sf (__a, __b);
++}
++
++__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
++vpaddq_f64 (float64x2_t __a, float64x2_t __b)
++{
++  return __builtin_aarch64_faddpv2df (__a, __b);
++}
++
+ __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+ vpadd_s8 (int8x8_t __a, int8x8_t __b)
+ {
+@@ -19010,6 +18638,12 @@ vpadd_u32 (uint32x2_t __a, uint32x2_t __b)
+ 						  (int32x2_t) __b);
+ }
+ 
++__extension__ static __inline float32_t __attribute__ ((__always_inline__))
++vpadds_f32 (float32x2_t __a)
++{
++  return __builtin_aarch64_reduc_plus_scal_v2sf (__a);
++}
++
+ __extension__ static __inline float64_t __attribute__ ((__always_inline__))
+ vpaddd_f64 (float64x2_t __a)
+ {
+@@ -21713,6 +21347,83 @@ vrshrd_n_u64 (uint64_t __a, const int __b)
+   return __builtin_aarch64_urshr_ndi_uus (__a, __b);
+ }
+ 
++/* vrsqrte.  */
++
++__extension__ static __inline float32_t __attribute__ ((__always_inline__))
++vrsqrtes_f32 (float32_t __a)
++{
++  return __builtin_aarch64_rsqrtesf (__a);
++}
++
++__extension__ static __inline float64_t __attribute__ ((__always_inline__))
++vrsqrted_f64 (float64_t __a)
++{
++  return __builtin_aarch64_rsqrtedf (__a);
++}
++
++__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++vrsqrte_f32 (float32x2_t __a)
++{
++  return __builtin_aarch64_rsqrtev2sf (__a);
++}
++
++__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
++vrsqrte_f64 (float64x1_t __a)
++{
++  return (float64x1_t) {vrsqrted_f64 (vget_lane_f64 (__a, 0))};
++}
++
++__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++vrsqrteq_f32 (float32x4_t __a)
++{
++  return __builtin_aarch64_rsqrtev4sf (__a);
++}
++
++__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
++vrsqrteq_f64 (float64x2_t __a)
++{
++  return __builtin_aarch64_rsqrtev2df (__a);
++}
++
++/* vrsqrts.  */
++
++__extension__ static __inline float32_t __attribute__ ((__always_inline__))
++vrsqrtss_f32 (float32_t __a, float32_t __b)
++{
++  return __builtin_aarch64_rsqrtssf (__a, __b);
++}
++
++__extension__ static __inline float64_t __attribute__ ((__always_inline__))
++vrsqrtsd_f64 (float64_t __a, float64_t __b)
++{
++  return __builtin_aarch64_rsqrtsdf (__a, __b);
++}
++
++__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++vrsqrts_f32 (float32x2_t __a, float32x2_t __b)
++{
++  return __builtin_aarch64_rsqrtsv2sf (__a, __b);
++}
++
++__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
++vrsqrts_f64 (float64x1_t __a, float64x1_t __b)
++{
++  return (float64x1_t) {vrsqrtsd_f64 (vget_lane_f64 (__a, 0),
++				      vget_lane_f64 (__b, 0))};
++}
++
++__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++vrsqrtsq_f32 (float32x4_t __a, float32x4_t __b)
++{
++  return __builtin_aarch64_rsqrtsv4sf (__a, __b);
++}
++
++__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
++vrsqrtsq_f64 (float64x2_t __a, float64x2_t __b)
++{
++  return __builtin_aarch64_rsqrtsv2df (__a, __b);
++}
++
+ /* vrsra */
+ 
+ __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
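The vrsqrte*/vrsqrts* intrinsics added above pair up in the usual
Newton-Raphson pattern: vrsqrte gives a low-precision estimate of
1/sqrt(a), and each vrsqrts step computes (3 - a*b)/2, the refinement
factor. A scalar sketch, again assuming this patched compiler on
AArch64:

#include <arm_neon.h>

/* Approximate 1/sqrt(a): initial estimate plus two refinement steps,
   which is roughly enough for single precision.  */
float32_t
rsqrtf (float32_t a)
{
  float32_t e = vrsqrtes_f32 (a);
  e = e * vrsqrtss_f32 (a * e, e);
  e = e * vrsqrtss_f32 (a * e, e);
  return e;
}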
 --- a/src/gcc/config/aarch64/iterators.md
 +++ b/src/gcc/config/aarch64/iterators.md
-@@ -715,6 +715,7 @@
+@@ -154,6 +154,12 @@
+ ;; Vector modes for S type.
+ (define_mode_iterator VDQ_SI [V2SI V4SI])
+ 
++;; Vector modes for S and D
++(define_mode_iterator VDQ_SDI [V2SI V4SI V2DI])
++
++;; Scalar and Vector modes for S and D
++(define_mode_iterator VSDQ_SDI [V2SI V4SI V2DI SI DI])
++
+ ;; Vector modes for Q and H types.
+ (define_mode_iterator VDQQH [V8QI V16QI V4HI V8HI])
+ 
+@@ -648,8 +654,13 @@
+ (define_mode_attr atomic_sfx
+   [(QI "b") (HI "h") (SI "") (DI "")])
+ 
+-(define_mode_attr fcvt_target [(V2DF "v2di") (V4SF "v4si") (V2SF "v2si") (SF "si") (DF "di")])
+-(define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI") (SF "SI") (DF "DI")])
++(define_mode_attr fcvt_target [(V2DF "v2di") (V4SF "v4si") (V2SF "v2si")
++			       (V2DI "v2df") (V4SI "v4sf") (V2SI "v2sf")
++			       (SF "si") (DF "di") (SI "sf") (DI "df")])
++(define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI")
++			       (V2DI "V2DF") (V4SI "V4SF") (V2SI "V2SF")
++			       (SF "SI") (DF "DI") (SI "SF") (DI "DF")])
++
+ 
+ ;; for the inequal width integer to fp conversions
+ (define_mode_attr fcvt_iesize [(SF "di") (DF "si")])
+@@ -715,6 +726,7 @@
  (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
  (define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
  
@@ -1405,9 +3631,42 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  (define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")])
  
  ;; -fpic small model GOT reloc modifers: gotpage_lo15/lo14 for ILP64/32.
+@@ -1001,6 +1013,9 @@
+ (define_int_iterator FCVT [UNSPEC_FRINTZ UNSPEC_FRINTP UNSPEC_FRINTM
+ 			    UNSPEC_FRINTA UNSPEC_FRINTN])
+ 
++(define_int_iterator FCVT_F2FIXED [UNSPEC_FCVTZS UNSPEC_FCVTZU])
++(define_int_iterator FCVT_FIXED2F [UNSPEC_SCVTF UNSPEC_UCVTF])
++
+ (define_int_iterator FRECP [UNSPEC_FRECPE UNSPEC_FRECPX])
+ 
+ (define_int_iterator CRC [UNSPEC_CRC32B UNSPEC_CRC32H UNSPEC_CRC32W
+@@ -1137,6 +1152,11 @@
+ 			       (UNSPEC_FRINTP "ceil") (UNSPEC_FRINTM "floor")
+ 			       (UNSPEC_FRINTN "frintn")])
+ 
++(define_int_attr fcvt_fixed_insn [(UNSPEC_SCVTF "scvtf")
++				  (UNSPEC_UCVTF "ucvtf")
++				  (UNSPEC_FCVTZS "fcvtzs")
++				  (UNSPEC_FCVTZU "fcvtzu")])
++
+ (define_int_attr perm_insn [(UNSPEC_ZIP1 "zip") (UNSPEC_ZIP2 "zip")
+ 			    (UNSPEC_TRN1 "trn") (UNSPEC_TRN2 "trn")
+ 			    (UNSPEC_UZP1 "uzp") (UNSPEC_UZP2 "uzp")])
 --- a/src/gcc/config/arm/arm-protos.h
 +++ b/src/gcc/config/arm/arm-protos.h
-@@ -319,6 +319,7 @@ extern int vfp3_const_double_for_bits (rtx);
+@@ -50,7 +50,9 @@ extern tree arm_builtin_decl (unsigned code, bool initialize_p
+ 			      ATTRIBUTE_UNUSED);
+ extern void arm_init_builtins (void);
+ extern void arm_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update);
+-
++extern rtx arm_simd_vect_par_cnst_half (machine_mode mode, bool high);
++extern bool arm_simd_check_vect_par_cnst_half_p (rtx op, machine_mode mode,
++						 bool high);
+ #ifdef RTX_CODE
+ extern bool arm_vector_mode_supported_p (machine_mode);
+ extern bool arm_small_register_classes_for_mode_p (machine_mode);
+@@ -319,6 +321,7 @@ extern int vfp3_const_double_for_bits (rtx);
  
  extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
  					   rtx);
@@ -1415,7 +3674,7 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  extern bool arm_valid_symbolic_address_p (rtx);
  extern bool arm_validize_comparison (rtx *, rtx *, rtx *);
  #endif /* RTX_CODE */
-@@ -601,6 +602,9 @@ extern int arm_tune_cortex_a9;
+@@ -601,6 +604,9 @@ extern int arm_tune_cortex_a9;
     interworking clean.  */
  extern int arm_cpp_interwork;
  
@@ -1501,6 +3760,87 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
  
  static unsigned HOST_WIDE_INT
+@@ -30311,4 +30322,80 @@ arm_sched_fusion_priority (rtx_insn *insn, int max_pri,
+   return;
+ }
+ 
++
++/* Construct and return a PARALLEL RTX vector with elements numbering the
++   lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
++   the vector - from the perspective of the architecture.  This does not
++   line up with GCC's perspective on lane numbers, so we end up with
++   different masks depending on our target endian-ness.  The diagram
++   below may help.  We must draw the distinction when building masks
++   which select one half of the vector.  An instruction selecting
++   architectural low-lanes for a big-endian target, must be described using
++   a mask selecting GCC high-lanes.
++
++                 Big-Endian             Little-Endian
++
++GCC             0   1   2   3           3   2   1   0
++              | x | x | x | x |       | x | x | x | x |
++Architecture    3   2   1   0           3   2   1   0
++
++Low Mask:         { 2, 3 }                { 0, 1 }
++High Mask:        { 0, 1 }                { 2, 3 }
++*/
++
++rtx
++arm_simd_vect_par_cnst_half (machine_mode mode, bool high)
++{
++  int nunits = GET_MODE_NUNITS (mode);
++  rtvec v = rtvec_alloc (nunits / 2);
++  int high_base = nunits / 2;
++  int low_base = 0;
++  int base;
++  rtx t1;
++  int i;
++
++  if (BYTES_BIG_ENDIAN)
++    base = high ? low_base : high_base;
++  else
++    base = high ? high_base : low_base;
++
++  for (i = 0; i < nunits / 2; i++)
++    RTVEC_ELT (v, i) = GEN_INT (base + i);
++
++  t1 = gen_rtx_PARALLEL (mode, v);
++  return t1;
++}
++
++/* Check OP for validity as a PARALLEL RTX vector with elements
++   numbering the lanes of either the high (HIGH == TRUE) or low lanes,
++   from the perspective of the architecture.  See the diagram above
++   arm_simd_vect_par_cnst_half for more details.  */
++
++bool
++arm_simd_check_vect_par_cnst_half_p (rtx op, machine_mode mode,
++				       bool high)
++{
++  rtx ideal = arm_simd_vect_par_cnst_half (mode, high);
++  HOST_WIDE_INT count_op = XVECLEN (op, 0);
++  HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
++  int i = 0;
++
++  if (!VECTOR_MODE_P (mode))
++    return false;
++
++  if (count_op != count_ideal)
++    return false;
++
++  for (i = 0; i < count_ideal; i++)
++    {
++      rtx elt_op = XVECEXP (op, 0, i);
++      rtx elt_ideal = XVECEXP (ideal, 0, i);
++
++      if (!CONST_INT_P (elt_op)
++	  || INTVAL (elt_ideal) != INTVAL (elt_op))
++	return false;
++    }
++  return true;
++}
++
+ #include "gt-arm.h"
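The mask table in the comment above is easy to sanity-check with a
host-side model of arm_simd_vect_par_cnst_half (a standalone sketch,
not GCC code; half_mask is a hypothetical helper mirroring the logic
quoted above):

#include <stdio.h>

/* For a vector of NUNITS lanes, write the GCC lane indices that select
   the architectural low or high half into OUT.  */
static void
half_mask (int nunits, int big_endian, int high, int *out)
{
  int base = big_endian ? (high ? 0 : nunits / 2)
                        : (high ? nunits / 2 : 0);
  for (int i = 0; i < nunits / 2; i++)
    out[i] = base + i;
}

int
main (void)
{
  int m[2];
  half_mask (4, 1, 0, m);   /* big-endian low half  -> { 2, 3 } */
  printf ("{ %d, %d }\n", m[0], m[1]);
  half_mask (4, 0, 0, m);   /* little-endian low half -> { 0, 1 } */
  printf ("{ %d, %d }\n", m[0], m[1]);
  return 0;
}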
 --- a/src/gcc/config/arm/arm.h
 +++ b/src/gcc/config/arm/arm.h
 @@ -478,6 +478,9 @@ extern int arm_tune_cortex_a9;
@@ -1531,7 +3871,27 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
     profile.  */
 --- a/src/gcc/config/arm/arm.md
 +++ b/src/gcc/config/arm/arm.md
-@@ -8152,8 +8152,8 @@
+@@ -121,7 +121,7 @@
+ ; arm_arch6.  "v6t2" for Thumb-2 with arm_arch6.  This attribute is
+ ; used to compute attribute "enabled", use type "any" to enable an
+ ; alternative in all cases.
+-(define_attr "arch" "any,a,t,32,t1,t2,v6,nov6,v6t2,neon_for_64bits,avoid_neon_for_64bits,iwmmxt,iwmmxt2,armv6_or_vfpv3"
++(define_attr "arch" "any,a,t,32,t1,t2,v6,nov6,v6t2,neon_for_64bits,avoid_neon_for_64bits,iwmmxt,iwmmxt2,armv6_or_vfpv3,neon"
+   (const_string "any"))
+ 
+ (define_attr "arch_enabled" "no,yes"
+@@ -177,6 +177,10 @@
+ 	 (and (eq_attr "arch" "armv6_or_vfpv3")
+ 	      (match_test "arm_arch6 || TARGET_VFP3"))
+ 	 (const_string "yes")
++
++	 (and (eq_attr "arch" "neon")
++	      (match_test "TARGET_NEON"))
++	 (const_string "yes")
+ 	]
+ 
+ 	(const_string "no")))
+@@ -8152,8 +8156,8 @@
  )
  
  (define_insn "probe_stack"
@@ -1542,8 +3902,96 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
    "TARGET_32BIT"
    "str%?\\tr0, %0"
    [(set_attr "type" "store1")
+@@ -10821,19 +10825,22 @@
+    (set_attr "predicable_short_it" "no")
+    (set_attr "type" "clz")])
+ 
+-(define_expand "ctzsi2"
+- [(set (match_operand:SI           0 "s_register_operand" "")
+-       (ctz:SI (match_operand:SI  1 "s_register_operand" "")))]
++;; Keep this as a CTZ expression until after reload and then split
++;; into RBIT + CLZ.  Since RBIT is represented as an UNSPEC it is unlikely
++;; to fold with any other expression.
++
++(define_insn_and_split "ctzsi2"
++ [(set (match_operand:SI           0 "s_register_operand" "=r")
++       (ctz:SI (match_operand:SI  1 "s_register_operand" "r")))]
+   "TARGET_32BIT && arm_arch_thumb2"
++  "#"
++  "&& reload_completed"
++  [(const_int 0)]
+   "
+-   {
+-     rtx tmp = gen_reg_rtx (SImode); 
+-     emit_insn (gen_rbitsi2 (tmp, operands[1]));
+-     emit_insn (gen_clzsi2 (operands[0], tmp));
+-   }
+-   DONE;
+-  "
+-)
++  emit_insn (gen_rbitsi2 (operands[0], operands[1]));
++  emit_insn (gen_clzsi2 (operands[0], operands[0]));
++  DONE;
++")
+ 
+ ;; V5E instructions.
+ 
 --- a/src/gcc/config/arm/arm_neon.h
 +++ b/src/gcc/config/arm/arm_neon.h
+@@ -530,7 +530,7 @@ vadd_s32 (int32x2_t __a, int32x2_t __b)
+ __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+ vadd_f32 (float32x2_t __a, float32x2_t __b)
+ {
+-#ifdef __FAST_MATH
++#ifdef __FAST_MATH__
+   return __a + __b;
+ #else
+   return (float32x2_t) __builtin_neon_vaddv2sf (__a, __b);
+@@ -594,7 +594,7 @@ vaddq_s64 (int64x2_t __a, int64x2_t __b)
+ __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+ vaddq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+-#ifdef __FAST_MATH
++#ifdef __FAST_MATH__
+   return __a + __b;
+ #else
+   return (float32x4_t) __builtin_neon_vaddv4sf (__a, __b);
+@@ -1030,7 +1030,7 @@ vmul_s32 (int32x2_t __a, int32x2_t __b)
+ __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+ vmul_f32 (float32x2_t __a, float32x2_t __b)
+ {
+-#ifdef __FAST_MATH
++#ifdef __FAST_MATH__
+   return __a * __b;
+ #else
+   return (float32x2_t) __builtin_neon_vmulfv2sf (__a, __b);
+@@ -1077,7 +1077,7 @@ vmulq_s32 (int32x4_t __a, int32x4_t __b)
+ __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+ vmulq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+-#ifdef __FAST_MATH
++#ifdef __FAST_MATH__
+   return __a * __b;
+ #else
+   return (float32x4_t) __builtin_neon_vmulfv4sf (__a, __b);
+@@ -1678,7 +1678,7 @@ vsub_s32 (int32x2_t __a, int32x2_t __b)
+ __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+ vsub_f32 (float32x2_t __a, float32x2_t __b)
+ {
+-#ifdef __FAST_MATH
++#ifdef __FAST_MATH__
+   return __a - __b;
+ #else
+   return (float32x2_t) __builtin_neon_vsubv2sf (__a, __b);
+@@ -1742,7 +1742,7 @@ vsubq_s64 (int64x2_t __a, int64x2_t __b)
+ __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+ vsubq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+-#ifdef __FAST_MATH
++#ifdef __FAST_MATH__
+   return __a - __b;
+ #else
+   return (float32x4_t) __builtin_neon_vsubv4sf (__a, __b);
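The __FAST_MATH -> __FAST_MATH__ hunks above fix a one-character bug:
GCC only predefines __FAST_MATH__ (with trailing underscores) under
-ffast-math, so the old #ifdef never held and these intrinsics always
took the builtin fallback even when plain vector arithmetic would do.
A quick way to see which macro actually exists:

#include <stdio.h>

int
main (void)
{
#ifdef __FAST_MATH__
  puts ("__FAST_MATH__ is defined (built with -ffast-math)");
#else
  puts ("__FAST_MATH__ is not defined");
#endif
#ifdef __FAST_MATH
  puts ("__FAST_MATH is defined");   /* never printed by GCC */
#endif
  return 0;
}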
 @@ -2607,6 +2607,12 @@ vtst_p8 (poly8x8_t __a, poly8x8_t __b)
    return (uint8x8_t)__builtin_neon_vtstv8qi ((int8x8_t) __a, (int8x8_t) __b);
  }
@@ -1603,6 +4051,209 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  )
  
  (define_insn "crypto_<crypto_pattern>"
+--- a/src/gcc/config/arm/neon.md
++++ b/src/gcc/config/arm/neon.md
+@@ -1204,16 +1204,133 @@
+ 
+ ;; Widening operations
+ 
++(define_expand "widen_ssum<mode>3"
++  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
++	(plus:<V_double_width>
++	 (sign_extend:<V_double_width>
++	  (match_operand:VQI 1 "s_register_operand" ""))
++	 (match_operand:<V_double_width> 2 "s_register_operand" "")))]
++  "TARGET_NEON"
++  {
++    machine_mode mode = GET_MODE (operands[1]);
++    rtx p1, p2;
++
++    p1  = arm_simd_vect_par_cnst_half (mode, false);
++    p2  = arm_simd_vect_par_cnst_half (mode, true);
++
++    if (operands[0] != operands[2])
++      emit_move_insn (operands[0], operands[2]);
++
++    emit_insn (gen_vec_sel_widen_ssum_lo<mode><V_half>3 (operands[0],
++							 operands[1],
++							 p1,
++							 operands[0]));
++    emit_insn (gen_vec_sel_widen_ssum_hi<mode><V_half>3 (operands[0],
++							 operands[1],
++							 p2,
++							 operands[0]));
++    DONE;
++  }
++)
++
++(define_insn "vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3"
++  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
++	(plus:<VW:V_widen>
++	 (sign_extend:<VW:V_widen>
++	  (vec_select:VW
++	   (match_operand:VQI 1 "s_register_operand" "%w")
++	   (match_operand:VQI 2 "vect_par_constant_low" "")))
++	 (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
++  "TARGET_NEON"
++{
++  return BYTES_BIG_ENDIAN ?  "vaddw.<V_s_elem>\t%q0, %q3, %f1" :
++    "vaddw.<V_s_elem>\t%q0, %q3, %e1";
++}
++  [(set_attr "type" "neon_add_widen")])
++
++(define_insn "vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3"
++  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
++	(plus:<VW:V_widen>
++	 (sign_extend:<VW:V_widen>
++	  (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
++			 (match_operand:VQI 2 "vect_par_constant_high" "")))
++	 (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
++  "TARGET_NEON"
++{
++  return BYTES_BIG_ENDIAN ?  "vaddw.<V_s_elem>\t%q0, %q3, %e1" :
++    "vaddw.<V_s_elem>\t%q0, %q3, %f1";
++}
++  [(set_attr "type" "neon_add_widen")])
++
+ (define_insn "widen_ssum<mode>3"
+   [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+-	(plus:<V_widen> (sign_extend:<V_widen>
+-			  (match_operand:VW 1 "s_register_operand" "%w"))
+-		        (match_operand:<V_widen> 2 "s_register_operand" "w")))]
++	(plus:<V_widen>
++	 (sign_extend:<V_widen>
++	  (match_operand:VW 1 "s_register_operand" "%w"))
++	 (match_operand:<V_widen> 2 "s_register_operand" "w")))]
+   "TARGET_NEON"
+   "vaddw.<V_s_elem>\t%q0, %q2, %P1"
+   [(set_attr "type" "neon_add_widen")]
+ )
+ 
++(define_expand "widen_usum<mode>3"
++  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
++	(plus:<V_double_width>
++	 (zero_extend:<V_double_width>
++	  (match_operand:VQI 1 "s_register_operand" ""))
++	 (match_operand:<V_double_width> 2 "s_register_operand" "")))]
++  "TARGET_NEON"
++  {
++    machine_mode mode = GET_MODE (operands[1]);
++    rtx p1, p2;
++
++    p1  = arm_simd_vect_par_cnst_half (mode, false);
++    p2  = arm_simd_vect_par_cnst_half (mode, true);
++
++    if (operands[0] != operands[2])
++      emit_move_insn (operands[0], operands[2]);
++
++    emit_insn (gen_vec_sel_widen_usum_lo<mode><V_half>3 (operands[0],
++							 operands[1],
++							 p1,
++							 operands[0]));
++    emit_insn (gen_vec_sel_widen_usum_hi<mode><V_half>3 (operands[0],
++							 operands[1],
++							 p2,
++							 operands[0]));
++    DONE;
++  }
++)
++
++(define_insn "vec_sel_widen_usum_lo<VQI:mode><VW:mode>3"
++  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
++	(plus:<VW:V_widen>
++	 (zero_extend:<VW:V_widen>
++	  (vec_select:VW
++	   (match_operand:VQI 1 "s_register_operand" "%w")
++	   (match_operand:VQI 2 "vect_par_constant_low" "")))
++	 (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
++  "TARGET_NEON"
++{
++  return BYTES_BIG_ENDIAN ?  "vaddw.<V_u_elem>\t%q0, %q3, %f1" :
++    "vaddw.<V_u_elem>\t%q0, %q3, %e1";
++}
++  [(set_attr "type" "neon_add_widen")])
++
++(define_insn "vec_sel_widen_usum_hi<VQI:mode><VW:mode>3"
++  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
++	(plus:<VW:V_widen>
++	 (zero_extend:<VW:V_widen>
++	  (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
++			 (match_operand:VQI 2 "vect_par_constant_high" "")))
++	 (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
++  "TARGET_NEON"
++{
++ return BYTES_BIG_ENDIAN ?  "vaddw.<V_u_elem>\t%q0, %q3, %e1" :
++    "vaddw.<V_u_elem>\t%q0, %q3, %f1";
++}
++  [(set_attr "type" "neon_add_widen")])
++
+ (define_insn "widen_usum<mode>3"
+   [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ 	(plus:<V_widen> (zero_extend:<V_widen>
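The widen_ssum/widen_usum expanders above let a narrow-element vector
be accumulated into a double-width vector with vaddw, selecting each
half with the endian-aware masks from arm_simd_vect_par_cnst_half. The
kind of loop these patterns serve is sketched below (whether it
actually vectorizes also depends on the optimization level and the
cost model):

/* Sum 16-bit elements into a 32-bit accumulator; on NEON this can use
   vaddw.s16 on the low and high halves of each vector load.  */
int
sum_shorts (const short *a, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    s += a[i];
  return s;
}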
+--- a/src/gcc/config/arm/predicates.md
++++ b/src/gcc/config/arm/predicates.md
+@@ -612,59 +612,13 @@
+ (define_special_predicate "vect_par_constant_high" 
+   (match_code "parallel")
+ {
+-  HOST_WIDE_INT count = XVECLEN (op, 0);
+-  int i;
+-  int base = GET_MODE_NUNITS (mode);
+-
+-  if ((count < 1)
+-      || (count != base/2))
+-    return false;
+-    
+-  if (!VECTOR_MODE_P (mode))
+-    return false;
+-
+-  for (i = 0; i < count; i++)
+-   {
+-     rtx elt = XVECEXP (op, 0, i);
+-     int val;
+-
+-     if (!CONST_INT_P (elt))
+-       return false;
+-
+-     val = INTVAL (elt);
+-     if (val != (base/2) + i)
+-       return false;
+-   }
+-  return true; 
++  return arm_simd_check_vect_par_cnst_half_p (op, mode, true);
+ })
+ 
+ (define_special_predicate "vect_par_constant_low"
+   (match_code "parallel")
+ {
+-  HOST_WIDE_INT count = XVECLEN (op, 0);
+-  int i;
+-  int base = GET_MODE_NUNITS (mode);
+-
+-  if ((count < 1)
+-      || (count != base/2))
+-    return false;
+-    
+-  if (!VECTOR_MODE_P (mode))
+-    return false;
+-
+-  for (i = 0; i < count; i++)
+-   {
+-     rtx elt = XVECEXP (op, 0, i);
+-     int val;
+-
+-     if (!CONST_INT_P (elt))
+-       return false;
+-
+-     val = INTVAL (elt);
+-     if (val != i)
+-       return false;
+-   } 
+-  return true; 
++  return arm_simd_check_vect_par_cnst_half_p (op, mode, false);
+ })
+ 
+ (define_predicate "const_double_vcvt_power_of_two_reciprocal"
 --- a/src/gcc/config/arm/sync.md
 +++ b/src/gcc/config/arm/sync.md
 @@ -452,14 +452,13 @@
@@ -1657,6 +4308,117 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  (define_insn "*thumb_mulsi3"
    [(set (match_operand:SI          0 "register_operand" "=&l,&l,&l")
  	(mult:SI (match_operand:SI 1 "register_operand" "%l,*h,0")
+--- a/src/gcc/config/arm/vfp.md
++++ b/src/gcc/config/arm/vfp.md
+@@ -394,8 +394,8 @@
+ ;; DFmode moves
+ 
+ (define_insn "*movdf_vfp"
+-  [(set (match_operand:DF 0 "nonimmediate_soft_df_operand" "=w,?r,w ,w  ,Uv,r, m,w,r")
+-	(match_operand:DF 1 "soft_df_operand"		   " ?r,w,Dy,UvF,w ,mF,r,w,r"))]
++  [(set (match_operand:DF 0 "nonimmediate_soft_df_operand" "=w,?r,w ,w,w  ,Uv,r, m,w,r")
++	(match_operand:DF 1 "soft_df_operand"		   " ?r,w,Dy,G,UvF,w ,mF,r,w,r"))]
+   "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP
+    && (   register_operand (operands[0], DFmode)
+        || register_operand (operands[1], DFmode))"
+@@ -410,39 +410,43 @@
+       case 2:
+ 	gcc_assert (TARGET_VFP_DOUBLE);
+         return \"vmov%?.f64\\t%P0, %1\";
+-      case 3: case 4:
++      case 3:
++	gcc_assert (TARGET_VFP_DOUBLE);
++	return \"vmov.i64\\t%P0, #0\\t%@ float\";
++      case 4: case 5:
+ 	return output_move_vfp (operands);
+-      case 5: case 6:
++      case 6: case 7:
+ 	return output_move_double (operands, true, NULL);
+-      case 7:
++      case 8:
+ 	if (TARGET_VFP_SINGLE)
+ 	  return \"vmov%?.f32\\t%0, %1\;vmov%?.f32\\t%p0, %p1\";
+ 	else
+ 	  return \"vmov%?.f64\\t%P0, %P1\";
+-      case 8:
++      case 9:
+         return \"#\";
+       default:
+ 	gcc_unreachable ();
+       }
+     }
+   "
+-  [(set_attr "type" "f_mcrr,f_mrrc,fconstd,f_loadd,f_stored,\
++  [(set_attr "type" "f_mcrr,f_mrrc,fconstd,neon_move,f_loadd,f_stored,\
+                      load2,store2,ffarithd,multiple")
+-   (set (attr "length") (cond [(eq_attr "alternative" "5,6,8") (const_int 8)
+-			       (eq_attr "alternative" "7")
++   (set (attr "length") (cond [(eq_attr "alternative" "6,7,9") (const_int 8)
++			       (eq_attr "alternative" "8")
+ 				(if_then_else
+ 				 (match_test "TARGET_VFP_SINGLE")
+ 				 (const_int 8)
+ 				 (const_int 4))]
+ 			      (const_int 4)))
+-   (set_attr "predicable" "yes")
+-   (set_attr "pool_range" "*,*,*,1020,*,1020,*,*,*")
+-   (set_attr "neg_pool_range" "*,*,*,1004,*,1004,*,*,*")]
++   (set_attr "predicable" "yes,yes,yes,no,yes,yes,yes,yes,yes,yes")
++   (set_attr "pool_range" "*,*,*,*,1020,*,1020,*,*,*")
++   (set_attr "neg_pool_range" "*,*,*,*,1004,*,1004,*,*,*")
++   (set_attr "arch" "any,any,any,neon,any,any,any,any,any,any")]
+ )
+ 
+ (define_insn "*thumb2_movdf_vfp"
+-  [(set (match_operand:DF 0 "nonimmediate_soft_df_operand" "=w,?r,w ,w  ,Uv,r ,m,w,r")
+-	(match_operand:DF 1 "soft_df_operand"		   " ?r,w,Dy,UvF,w, mF,r, w,r"))]
++  [(set (match_operand:DF 0 "nonimmediate_soft_df_operand" "=w,?r,w ,w,w  ,Uv,r ,m,w,r")
++	(match_operand:DF 1 "soft_df_operand"		   " ?r,w,Dy,G,UvF,w, mF,r, w,r"))]
+   "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_VFP
+    && (   register_operand (operands[0], DFmode)
+        || register_operand (operands[1], DFmode))"
+@@ -457,11 +461,14 @@
+       case 2:
+ 	gcc_assert (TARGET_VFP_DOUBLE);
+ 	return \"vmov%?.f64\\t%P0, %1\";
+-      case 3: case 4:
++      case 3:
++	gcc_assert (TARGET_VFP_DOUBLE);
++	return \"vmov.i64\\t%P0, #0\\t%@ float\";
++      case 4: case 5:
+ 	return output_move_vfp (operands);
+-      case 5: case 6: case 8:
++      case 6: case 7: case 9:
+ 	return output_move_double (operands, true, NULL);
+-      case 7:
++      case 8:
+ 	if (TARGET_VFP_SINGLE)
+ 	  return \"vmov%?.f32\\t%0, %1\;vmov%?.f32\\t%p0, %p1\";
+ 	else
+@@ -471,17 +478,18 @@
+       }
+     }
+   "
+-  [(set_attr "type" "f_mcrr,f_mrrc,fconstd,f_loadd,\
++  [(set_attr "type" "f_mcrr,f_mrrc,fconstd,neon_move,f_loadd,\
+                      f_stored,load2,store2,ffarithd,multiple")
+-   (set (attr "length") (cond [(eq_attr "alternative" "5,6,8") (const_int 8)
+-			       (eq_attr "alternative" "7")
++   (set (attr "length") (cond [(eq_attr "alternative" "6,7,9") (const_int 8)
++			       (eq_attr "alternative" "8")
+ 				(if_then_else
+ 				 (match_test "TARGET_VFP_SINGLE")
+ 				 (const_int 8)
+ 				 (const_int 4))]
+ 			      (const_int 4)))
+-   (set_attr "pool_range" "*,*,*,1018,*,4094,*,*,*")
+-   (set_attr "neg_pool_range" "*,*,*,1008,*,0,*,*,*")]
++   (set_attr "pool_range" "*,*,*,*,1018,*,4094,*,*,*")
++   (set_attr "neg_pool_range" "*,*,*,*,1008,*,0,*,*,*")
++   (set_attr "arch" "any,any,any,neon,any,any,any,any,any,any")]
+ )
+ 
+ 
 --- a/src/gcc/configure
 +++ b/src/gcc/configure
 @@ -1711,7 +1711,8 @@ Optional Packages:
@@ -1752,6 +4514,100 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
    cpp_define_formatted (pfile, "__ATOMIC_RELAXED=%d", MEMMODEL_RELAXED);
    cpp_define_formatted (pfile, "__ATOMIC_SEQ_CST=%d", MEMMODEL_SEQ_CST);
    cpp_define_formatted (pfile, "__ATOMIC_ACQUIRE=%d", MEMMODEL_ACQUIRE);
+--- a/src/gcc/ifcvt.c
++++ b/src/gcc/ifcvt.c
+@@ -817,6 +817,7 @@ struct noce_if_info
+ 
+ static rtx noce_emit_store_flag (struct noce_if_info *, rtx, int, int);
+ static int noce_try_move (struct noce_if_info *);
++static int noce_try_ifelse_collapse (struct noce_if_info *);
+ static int noce_try_store_flag (struct noce_if_info *);
+ static int noce_try_addcc (struct noce_if_info *);
+ static int noce_try_store_flag_constants (struct noce_if_info *);
+@@ -1120,6 +1121,37 @@ noce_try_move (struct noce_if_info *if_info)
+   return FALSE;
+ }
+ 
++/* Try forming an IF_THEN_ELSE (cond, b, a) and collapsing that
++   through simplify_rtx.  Sometimes that can eliminate the IF_THEN_ELSE.
++   If that is the case, emit the result into x.  */
++
++static int
++noce_try_ifelse_collapse (struct noce_if_info * if_info)
++{
++  if (!noce_simple_bbs (if_info))
++    return FALSE;
++
++  machine_mode mode = GET_MODE (if_info->x);
++  rtx if_then_else = simplify_gen_ternary (IF_THEN_ELSE, mode, mode,
++					    if_info->cond, if_info->b,
++					    if_info->a);
++
++  if (GET_CODE (if_then_else) == IF_THEN_ELSE)
++    return FALSE;
++
++  rtx_insn *seq;
++  start_sequence ();
++  noce_emit_move_insn (if_info->x, if_then_else);
++  seq = end_ifcvt_sequence (if_info);
++  if (!seq)
++    return FALSE;
++
++  emit_insn_before_setloc (seq, if_info->jump,
++			  INSN_LOCATION (if_info->insn_a));
++  return TRUE;
++}
++
++
+ /* Convert "if (test) x = 1; else x = 0".
+ 
+    Only try 0 and STORE_FLAG_VALUE here.  Other combinations will be
+@@ -2364,28 +2396,32 @@ noce_get_alt_condition (struct noce_if_info *if_info, rtx target,
+ 	  switch (code)
+ 	    {
+ 	    case LT:
+-	      if (actual_val == desired_val + 1)
++	      if (desired_val != HOST_WIDE_INT_MAX
++		  && actual_val == desired_val + 1)
+ 		{
+ 		  code = LE;
+ 		  op_b = GEN_INT (desired_val);
+ 		}
+ 	      break;
+ 	    case LE:
+-	      if (actual_val == desired_val - 1)
++	      if (desired_val != HOST_WIDE_INT_MIN
++		  && actual_val == desired_val - 1)
+ 		{
+ 		  code = LT;
+ 		  op_b = GEN_INT (desired_val);
+ 		}
+ 	      break;
+ 	    case GT:
+-	      if (actual_val == desired_val - 1)
++	      if (desired_val != HOST_WIDE_INT_MIN
++		  && actual_val == desired_val - 1)
+ 		{
+ 		  code = GE;
+ 		  op_b = GEN_INT (desired_val);
+ 		}
+ 	      break;
+ 	    case GE:
+-	      if (actual_val == desired_val + 1)
++	      if (desired_val != HOST_WIDE_INT_MAX
++		  && actual_val == desired_val + 1)
+ 		{
+ 		  code = GT;
+ 		  op_b = GEN_INT (desired_val);
+@@ -3493,6 +3529,8 @@ noce_process_if_block (struct noce_if_info *if_info)
+ 
+   if (noce_try_move (if_info))
+     goto success;
++  if (noce_try_ifelse_collapse (if_info))
++    goto success;
+   if (noce_try_store_flag (if_info))
+     goto success;
+   if (noce_try_bitop (if_info))
 --- a/src/gcc/internal-fn.c
 +++ b/src/gcc/internal-fn.c
 @@ -1807,11 +1807,7 @@ expand_arith_overflow (enum tree_code code, gimple *stmt)
@@ -1767,6 +4623,32 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  	  enum machine_mode m = smallest_mode_for_size (p, MODE_INT);
  	  tree optype = build_nonstandard_integer_type (GET_MODE_PRECISION (m),
  							uns0_p && uns1_p
+--- a/src/gcc/lra-constraints.c
++++ b/src/gcc/lra-constraints.c
+@@ -1303,7 +1303,22 @@ process_addr_reg (rtx *loc, bool check_only_p, rtx_insn **before, rtx_insn **aft
+ 
+   subreg_p = GET_CODE (*loc) == SUBREG;
+   if (subreg_p)
+-    loc = &SUBREG_REG (*loc);
++    {
++      reg = SUBREG_REG (*loc);
++      mode = GET_MODE (reg);
++
++      /* For a mode with size bigger than ptr_mode, there is unlikely to be a
++	 "mov" between two registers with different classes, but there will
++	 normally be a "mov" which transfers an element of a vector register
++	 into a general register, and this will normally be a subreg which
++	 should be reloaded as a whole.  This is particularly likely to be
++	 triggered when -fno-split-wide-types is specified.  */
++      if (!REG_P (reg)
++	  || in_class_p (reg, cl, &new_class)
++	  || GET_MODE_SIZE (mode) <= GET_MODE_SIZE (ptr_mode))
++       loc = &SUBREG_REG (*loc);
++    }
++
+   reg = *loc;
+   mode = GET_MODE (reg);
+   if (! REG_P (reg))
 --- a/src/gcc/lto/lto-partition.c
 +++ b/src/gcc/lto/lto-partition.c
 @@ -447,7 +447,7 @@ add_sorted_nodes (vec<symtab_node *> &next_nodes, ltrans_partition partition)
@@ -1840,6 +4722,98 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  
  /* Diagnostic parameters.  */
  
+--- a/src/gcc/rtlanal.c
++++ b/src/gcc/rtlanal.c
+@@ -3657,6 +3657,16 @@ subreg_get_info (unsigned int xregno, machine_mode xmode,
+ 	  info->offset = offset / regsize_xmode;
+ 	  return;
+ 	}
++      /* It's not valid to extract a subreg of mode YMODE at OFFSET that
++	 would go outside of XMODE.  */
++      if (!rknown
++	  && GET_MODE_SIZE (ymode) + offset > GET_MODE_SIZE (xmode))
++	{
++	  info->representable_p = false;
++	  info->nregs = nregs_ymode;
++	  info->offset = offset / regsize_xmode;
++	  return;
++	}
+       /* Quick exit for the simple and common case of extracting whole
+ 	 subregisters from a multiregister value.  */
+       /* ??? It would be better to integrate this into the code below,
+--- a/src/gcc/simplify-rtx.c
++++ b/src/gcc/simplify-rtx.c
+@@ -5267,6 +5267,50 @@ simplify_const_relational_operation (enum rtx_code code,
+ 
+   return 0;
+ }
++
++/* Recognize expressions of the form (X CMP 0) ? VAL : OP (X)
++   where OP is CLZ or CTZ and VAL is the value from CLZ_DEFINED_VALUE_AT_ZERO
++   or CTZ_DEFINED_VALUE_AT_ZERO respectively and return OP (X) if the expression
++   can be simplified to that or NULL_RTX if not.
++   Assume X is compared against zero with CMP_CODE and the true
++   arm is TRUE_VAL and the false arm is FALSE_VAL.  */
++
++static rtx
++simplify_cond_clz_ctz (rtx x, rtx_code cmp_code, rtx true_val, rtx false_val)
++{
++  if (cmp_code != EQ && cmp_code != NE)
++    return NULL_RTX;
++
++  /* Result on X == 0 and X !=0 respectively.  */
++  rtx on_zero, on_nonzero;
++  if (cmp_code == EQ)
++    {
++      on_zero = true_val;
++      on_nonzero = false_val;
++    }
++  else
++    {
++      on_zero = false_val;
++      on_nonzero = true_val;
++    }
++
++  rtx_code op_code = GET_CODE (on_nonzero);
++  if ((op_code != CLZ && op_code != CTZ)
++      || !rtx_equal_p (XEXP (on_nonzero, 0), x)
++      || !CONST_INT_P (on_zero))
++    return NULL_RTX;
++
++  HOST_WIDE_INT op_val;
++  if (((op_code == CLZ
++	&& CLZ_DEFINED_VALUE_AT_ZERO (GET_MODE (on_nonzero), op_val))
++      || (op_code == CTZ
++	  && CTZ_DEFINED_VALUE_AT_ZERO (GET_MODE (on_nonzero), op_val)))
++      && op_val == INTVAL (on_zero))
++    return on_nonzero;
++
++  return NULL_RTX;
++}
++
+ 

+ /* Simplify CODE, an operation with result mode MODE and three operands,
+    OP0, OP1, and OP2.  OP0_MODE was the mode of OP0 before it became
+@@ -5400,6 +5444,19 @@ simplify_ternary_operation (enum rtx_code code, machine_mode mode,
+ 	    }
+ 	}
+ 
++      /* Convert x == 0 ? N : clz (x) into clz (x) when
++	 CLZ_DEFINED_VALUE_AT_ZERO is defined to N for the mode of x.
++	 Similarly for ctz (x).  */
++      if (COMPARISON_P (op0) && !side_effects_p (op0)
++	  && XEXP (op0, 1) == const0_rtx)
++	{
++	  rtx simplified
++	    = simplify_cond_clz_ctz (XEXP (op0, 0), GET_CODE (op0),
++				     op1, op2);
++	  if (simplified)
++	    return simplified;
++	}
++
+       if (COMPARISON_P (op0) && ! side_effects_p (op0))
+ 	{
+ 	  machine_mode cmp_mode = (GET_MODE (XEXP (op0, 0)) == VOIDmode
 --- a/src/gcc/testsuite/g++.dg/lto/pr69589_0.C
 +++ b/src/gcc/testsuite/g++.dg/lto/pr69589_0.C
 @@ -1,6 +1,8 @@
@@ -1852,6 +4826,97 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  #pragma GCC visibility push(hidden)
  struct A { int &operator[] (long); };
  template <typename> struct B;
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.c-torture/compile/pr71295.c
+@@ -0,0 +1,12 @@
++extern void fn2 (long long);
++int a;
++
++void
++fn1 ()
++{
++  long long b[3];
++  a = 0;
++  for (; a < 3; a++)
++    b[a] = 1;
++  fn2 (b[1]);
++}
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.c-torture/execute/pr37780.c
+@@ -0,0 +1,49 @@
++/* PR middle-end/37780.  */
++
++#define VAL (8 * sizeof (int))
++
++int __attribute__ ((noinline, noclone))
++fooctz (int i)
++{
++  return (i == 0) ? VAL : __builtin_ctz (i);
++}
++
++int __attribute__ ((noinline, noclone))
++fooctz2 (int i)
++{
++  return (i != 0) ? __builtin_ctz (i) : VAL;
++}
++
++unsigned int __attribute__ ((noinline, noclone))
++fooctz3 (unsigned int i)
++{
++  return (i > 0) ?  __builtin_ctz (i) : VAL;
++}
++
++int __attribute__ ((noinline, noclone))
++fooclz (int i)
++{
++  return (i == 0) ? VAL : __builtin_clz (i);
++}
++
++int __attribute__ ((noinline, noclone))
++fooclz2 (int i)
++{
++  return (i != 0) ? __builtin_clz (i) : VAL;
++}
++
++unsigned int __attribute__ ((noinline, noclone))
++fooclz3 (unsigned int i)
++{
++  return (i > 0) ? __builtin_clz (i) : VAL;
++}
++
++int
++main (void)
++{
++  if (fooctz (0) != VAL || fooctz2 (0) != VAL || fooctz3 (0) != VAL
++      || fooclz (0) != VAL || fooclz2 (0) != VAL || fooclz3 (0) != VAL)
++    __builtin_abort ();
++
++  return 0;
++}
+\ No newline at end of file
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.c-torture/execute/pr66940.c
+@@ -0,0 +1,20 @@
++long long __attribute__ ((noinline, noclone))
++foo (long long ival)
++{
++ if (ival <= 0)
++    return -0x7fffffffffffffffL - 1;
++
++ return 0x7fffffffffffffffL;
++}
++
++int
++main (void)
++{
++  if (foo (-1) != (-0x7fffffffffffffffL - 1))
++    __builtin_abort ();
++
++  if (foo (1) != 0x7fffffffffffffffL)
++    __builtin_abort ();
++
++  return 0;
++}
 --- a/src/gcc/testsuite/gcc.dg/plugin/plugin.exp
 +++ b/src/gcc/testsuite/gcc.dg/plugin/plugin.exp
 @@ -87,6 +87,12 @@ foreach plugin_test $plugin_test_list {
@@ -1931,6 +4996,317 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
 +
 +
 +
+--- a/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-2.c
++++ b/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-2.c
+@@ -25,6 +25,7 @@ f1 (int i, ...)
+ /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units and 0 FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units and 0 FPR units" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -45,6 +46,7 @@ f2 (int i, ...)
+ /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save \[148\] GPR units and 0 FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save 8 GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save 1 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save 8 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -60,6 +62,7 @@ f3 (int i, ...)
+ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 0 GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */
+ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 0 GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { powerpc*-*-linux* && { powerpc_fprs && ilp32 } } } } } */
+ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 0 GPR units and 1 FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 0 GPR units and 16 FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 8 GPR units and 2" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save \[1-9\]\[0-9\]* GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save \[1-9\]\[0-9\]* GPR units" "stdarg" { target ia64-*-* } } } */
+@@ -78,6 +81,7 @@ f4 (int i, ...)
+ /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -96,6 +100,7 @@ f5 (int i, ...)
+ /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -116,6 +121,7 @@ f6 (int i, ...)
+ /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save (3|12|24) GPR units and 0 FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save 24 GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save 3 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save 24 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save (3|12|24) GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save (3|12|24) GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save (3|12|24) GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -133,6 +139,7 @@ f7 (int i, ...)
+ /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -152,6 +159,7 @@ f8 (int i, ...)
+ /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -169,6 +177,7 @@ f9 (int i, ...)
+ /* { dg-final { scan-tree-dump "f9: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f9: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f9: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f9: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f9: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f9: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f9: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -188,6 +197,7 @@ f10 (int i, ...)
+ /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -208,6 +218,7 @@ f11 (int i, ...)
+ /* { dg-final { scan-tree-dump "f11: va_list escapes 0, needs to save (3|12|24) GPR units and 0 FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f11: va_list escapes 0, needs to save 24 GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f11: va_list escapes 0, needs to save 3 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f11: va_list escapes 0, needs to save 24 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f11: va_list escapes 0, needs to save (3|12|24) GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f11: va_list escapes 0, needs to save (3|12|24) GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f11: va_list escapes 0, needs to save (3|12|24) GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -228,6 +239,7 @@ f12 (int i, ...)
+ /* { dg-final { scan-tree-dump "f12: va_list escapes 0, needs to save 0 GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { powerpc*-*-linux* && { powerpc_fprs && ilp32 } } } } } */
+ /* { dg-final { scan-tree-dump "f12: va_list escapes 0, needs to save 24 GPR units and 2" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f12: va_list escapes 0, needs to save 0 GPR units and 3 FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f12: va_list escapes 0, needs to save 0 GPR units and 48 FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f12: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f12: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f12: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -248,6 +260,7 @@ f13 (int i, ...)
+ /* { dg-final { scan-tree-dump "f13: va_list escapes 0, needs to save 0 GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { powerpc*-*-linux* && { powerpc_fprs && ilp32 } } } } } */
+ /* { dg-final { scan-tree-dump "f13: va_list escapes 0, needs to save 24 GPR units and 2" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f13: va_list escapes 0, needs to save 0 GPR units and 3 FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f13: va_list escapes 0, needs to save 0 GPR units and 48 FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f13: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f13: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f13: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -268,6 +281,7 @@ f14 (int i, ...)
+ /* { dg-final { scan-tree-dump "f14: va_list escapes 0, needs to save \[148\] GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { powerpc*-*-linux* && { powerpc_fprs && ilp32 } } } } } */
+ /* { dg-final { scan-tree-dump "f14: va_list escapes 0, needs to save 24 GPR units and 3" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f14: va_list escapes 0, needs to save 1 GPR units and 2 FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f14: va_list escapes 0, needs to save 8 GPR units and 32 FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f14: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f14: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f14: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -291,6 +305,7 @@ f15 (int i, ...)
+ /* { dg-final { scan-tree-dump "f15: va_list escapes 0, needs to save \[148\] GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */
+ /* { dg-final { scan-tree-dump "f15: va_list escapes 0, needs to save \[148\] GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { powerpc*-*-linux* && { powerpc_fprs && ilp32 } } } } } */
+ /* { dg-final { scan-tree-dump "f15: va_list escapes 0, needs to save 1 GPR units and 2 FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f15: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
+ 
+ /* We may be able to improve upon this after fixing PR66010/PR66013.  */
+ /* { dg-final { scan-tree-dump "f15: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
+--- a/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-3.c
++++ b/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-3.c
+@@ -24,6 +24,7 @@ f1 (int i, ...)
+ /* { dg-final { scan-tree-dump "f1: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f1: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f1: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f1: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f1: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f1: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f1: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -39,6 +40,7 @@ f2 (int i, ...)
+ /* { dg-final { scan-tree-dump "f2: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f2: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f2: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f2: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f2: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f2: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f2: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -57,6 +59,7 @@ f3 (int i, ...)
+ /* { dg-final { scan-tree-dump "f3: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f3: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f3: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f3: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f3: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f3: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f3: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -73,6 +76,7 @@ f4 (int i, ...)
+ /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -89,6 +93,7 @@ f5 (int i, ...)
+ /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -107,6 +112,7 @@ f6 (int i, ...)
+ /* { dg-final { scan-tree-dump "f6: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f6: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f6: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f6: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f6: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f6: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f6: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -123,6 +129,7 @@ f7 (int i, ...)
+ /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -139,6 +146,7 @@ f8 (int i, ...)
+ /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -155,6 +163,7 @@ f10 (int i, ...)
+ /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -171,6 +180,7 @@ f11 (int i, ...)
+ /* { dg-final { scan-tree-dump "f11: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f11: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f11: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f11: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f11: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f11: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f11: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -187,6 +197,7 @@ f12 (int i, ...)
+ /* { dg-final { scan-tree-dump "f12: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f12: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f12: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f12: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f12: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f12: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f12: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+--- a/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-4.c
++++ b/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-4.c
+@@ -27,6 +27,7 @@ f1 (int i, ...)
+ /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save all GPR units and 0 FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save all GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save all GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save all GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f1: va_list escapes \[01\], needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f1: va_list escapes \[01\], needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f1: va_list escapes \[01\], needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -44,6 +45,7 @@ f2 (int i, ...)
+ /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save 0 GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && { powerpc_fprs && ilp32 } } } } } */
+ /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save all GPR units and 2" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save 0 GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save 0 GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f2: va_list escapes \[01\], needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f2: va_list escapes \[01\], needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f2: va_list escapes \[01\], needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -67,6 +69,7 @@ f3 (int i, ...)
+ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save \[148\] GPR units and 0 FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 8 GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 1 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 8 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+@@ -88,6 +91,7 @@ f4 (int i, ...)
+ /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 0 GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { powerpc*-*-linux* && { powerpc_fprs && ilp32 } } } } } */
+ /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 8 GPR units and 2" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 0 GPR units and 1 FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 0 GPR units and 16 FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
+--- a/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-5.c
++++ b/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-5.c
+@@ -25,6 +25,7 @@ f1 (int i, ...)
+ /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units and 0 FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */
+ /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save all GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save all GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save all GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
+ 
+ void
+ f2 (int i, ...)
+@@ -38,6 +39,7 @@ f2 (int i, ...)
+ /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save all GPR units and all FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */
+ /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save all GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save all GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save all GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
+ 
+ /* Here va_arg can be executed at most as many times as va_start.  */
+ void
+@@ -56,6 +58,7 @@ f3 (int i, ...)
+ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 0 GPR units and 0 FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */
+ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 32 GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 1 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 8 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
+ 
+ void
+ f4 (int i, ...)
+@@ -74,6 +77,7 @@ f4 (int i, ...)
+ /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 16 GPR units and 16 FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */
+ /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 24 GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 2 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 24 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
+ 
+ void
+ f5 (int i, ...)
+@@ -88,6 +92,7 @@ f5 (int i, ...)
+ /* { dg-final { scan-tree-dump "f5: va_list escapes 0, needs to save 16 GPR units and 0 FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */
+ /* { dg-final { scan-tree-dump "f5: va_list escapes 0, needs to save 32 GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f5: va_list escapes 0, needs to save (4|2) GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f5: va_list escapes 0, needs to save 16 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
+ 
+ void
+ f6 (int i, ...)
+@@ -102,6 +107,7 @@ f6 (int i, ...)
+ /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save 8 GPR units and 32 FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */
+ /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save 32 GPR units and 3" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save (3|2) GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save 8 GPR units and 32 FPR units" "stdarg" { target aarch64*-*-* } } } */
+ 
+ void
+ f7 (int i, ...)
+@@ -116,3 +122,4 @@ f7 (int i, ...)
+ /* { dg-final { scan-tree-dump "f7: va_list escapes 0, needs to save 0 GPR units and 64 FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */
+ /* { dg-final { scan-tree-dump "f7: va_list escapes 0, needs to save 32 GPR units and 2" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "f7: va_list escapes 0, needs to save 2 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "f7: va_list escapes 0, needs to save 0 GPR units and 64 FPR units" "stdarg" { target aarch64*-*-* } } } */
+--- a/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-6.c
++++ b/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-6.c
+@@ -30,6 +30,7 @@ bar (int x, char const *y, ...)
+ /* { dg-final { scan-tree-dump "bar: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
+ /* { dg-final { scan-tree-dump "bar: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
+ /* { dg-final { scan-tree-dump "bar: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
++/* { dg-final { scan-tree-dump "bar: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
+ /* { dg-final { scan-tree-dump "bar: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
+ /* { dg-final { scan-tree-dump "bar: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
+ /* { dg-final { scan-tree-dump "bar: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
 --- /dev/null
 +++ b/src/gcc/testsuite/gcc.dg/vect/pr57206.c
 @@ -0,0 +1,11 @@
@@ -4900,6 +8276,71 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
 +/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */
  
  
+--- a/src/gcc/testsuite/gcc.target/aarch64/fmovd-zero-reg.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/fmovd-zero-reg.c
+@@ -8,4 +8,4 @@ foo (void)
+   bar (0.0);
+ }
+ 
+-/* { dg-final { scan-assembler "fmov\\td0, xzr" } } */
++/* { dg-final { scan-assembler "movi\\td0, #0" } } */
+--- a/src/gcc/testsuite/gcc.target/aarch64/fmovf-zero-reg.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/fmovf-zero-reg.c
+@@ -8,4 +8,4 @@ foo (void)
+   bar (0.0);
+ }
+ 
+-/* { dg-final { scan-assembler "fmov\\ts0, wzr" } } */
++/* { dg-final { scan-assembler "movi\\tv0\.2s, #0" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/aarch64/pr37780_1.c
+@@ -0,0 +1,46 @@
++/* Test that we can remove the conditional move due to CLZ
++   and CTZ being defined at zero.  */
++
++/* { dg-do compile } */
++/* { dg-options "-O2" } */
++
++int
++fooctz (int i)
++{
++  return (i == 0) ? 32 : __builtin_ctz (i);
++}
++
++int
++fooctz2 (int i)
++{
++  return (i != 0) ? __builtin_ctz (i) : 32;
++}
++
++unsigned int
++fooctz3 (unsigned int i)
++{
++  return (i > 0) ?  __builtin_ctz (i) : 32;
++}
++
++/* { dg-final { scan-assembler-times "rbit\t*" 3 } } */
++
++int
++fooclz (int i)
++{
++  return (i == 0) ? 32 : __builtin_clz (i);
++}
++
++int
++fooclz2 (int i)
++{
++  return (i != 0) ? __builtin_clz (i) : 32;
++}
++
++unsigned int
++fooclz3 (unsigned int i)
++{
++  return (i > 0) ? __builtin_clz (i) : 32;
++}
++
++/* { dg-final { scan-assembler-times "clz\t" 6 } } */
++/* { dg-final { scan-assembler-not "cmp\t.*0" } } */
 --- /dev/null
 +++ b/src/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c
 @@ -0,0 +1,541 @@
@@ -5479,6 +8920,70 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
 +
 +/* { dg-final { scan-assembler-not "mov\tx0, x8" } } */
 --- /dev/null
++++ b/src/gcc/testsuite/gcc.target/aarch64/va_arg_1.c
+@@ -0,0 +1,11 @@
++/* { dg-do compile } */
++/* { dg-options "-O2 --save-temps" } */
++
++int
++f (int a, ...)
++{
++  /* { dg-final { scan-assembler-not "str" } } */
++  return a;
++}
++
++/* { dg-final { cleanup-saved-temps } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/aarch64/va_arg_2.c
+@@ -0,0 +1,18 @@
++/* { dg-do compile } */
++/* { dg-options "-O2 --save-temps" } */
++
++int
++foo (char *fmt, ...)
++{
++  int d;
++  __builtin_va_list ap;
++
++  __builtin_va_start (ap, fmt);
++  d = __builtin_va_arg (ap, int);
++  __builtin_va_end (ap);
++
++  /* { dg-final { scan-assembler-not "x7" } } */
++  return d;
++}
++
++/* { dg-final { cleanup-saved-temps } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/aarch64/va_arg_3.c
+@@ -0,0 +1,26 @@
++/* { dg-do compile } */
++/* { dg-options "-O2 --save-temps" } */
++
++int d2i (double a);
++
++int
++foo (char *fmt, ...)
++{
++  int d, e;
++  double f, g;
++  __builtin_va_list ap;
++
++  __builtin_va_start (ap, fmt);
++  d = __builtin_va_arg (ap, int);
++  f = __builtin_va_arg (ap, double);
++  g = __builtin_va_arg (ap, double);
++  d += d2i (f);
++  d += d2i (g);
++  __builtin_va_end (ap);
++
++  /* { dg-final { scan-assembler-not "x7" } } */
++  /* { dg-final { scan-assembler-not "q7" } } */
++  return d;
++}
++
++/* { dg-final { cleanup-saved-temps } } */
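
A hedged gloss on what these three va_arg tests check (the AAPCS64 background is mine, not text from the patch): anonymous arguments arrive in x0-x7 and v0-v7, and once the stdarg pass has proven how much of that window va_arg can consume, the prologue spills only that much. A function that reads a single int after the named argument should therefore never reference x7 or any FP/SIMD register:

int
take_one_int (char *fmt, ...)          /* illustrative name */
{
  __builtin_va_list ap;
  __builtin_va_start (ap, fmt);
  int d = __builtin_va_arg (ap, int);  /* consumes x1 only, so x2-x7
                                          and v0-v7 need no save slots */
  __builtin_va_end (ap);
  return d;
}
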
+--- /dev/null
 +++ b/src/gcc/testsuite/gcc.target/arm/armv5_thumb_isa.c
 @@ -0,0 +1,8 @@
 +/* { dg-require-effective-target arm_arch_v5_ok } */
@@ -5489,6 +8994,164 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
 +#endif
 +
 +int foo;
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
+@@ -0,0 +1,19 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_neon_ok } */
++/* { dg-options "-O3" } */
++/* { dg-add-options arm_neon } */
++
++
++
++int
++t6 (int len, void * dummy, short * __restrict x)
++{
++  len = len & ~31;
++  int result = 0;
++  __asm volatile ("");
++  for (int i = 0; i < len; i++)
++    result += x[i];
++  return result;
++}
++
++/* { dg-final { scan-assembler "vaddw\.s16" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
+@@ -0,0 +1,18 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_neon_ok } */
++/* { dg-options "-O3" } */
++/* { dg-add-options arm_neon } */
++
++
++int
++t6 (int len, void * dummy, int * __restrict x)
++{
++  len = len & ~31;
++  long long result = 0;
++  __asm volatile ("");
++  for (int i = 0; i < len; i++)
++    result += x[i];
++  return result;
++}
++
++/* { dg-final { scan-assembler "vaddw\.s32" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
+@@ -0,0 +1,18 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_neon_ok } */
++/* { dg-options "-O3" } */
++/* { dg-add-options arm_neon } */
++
++
++int
++t6 (int len, void * dummy, unsigned short * __restrict x)
++{
++  len = len & ~31;
++  unsigned int result = 0;
++  __asm volatile ("");
++  for (int i = 0; i < len; i++)
++    result += x[i];
++  return result;
++}
++
++/* { dg-final { scan-assembler "vaddw.u16" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
+@@ -0,0 +1,18 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_neon_ok } */
++/* { dg-options "-O3" } */
++/* { dg-add-options arm_neon } */
++
++
++int
++t6 (int len, void * dummy, unsigned int * __restrict x)
++{
++  len = len & ~31;
++  unsigned long long result = 0;
++  __asm volatile ("");
++  for (int i = 0; i < len; i++)
++    result += x[i];
++  return result;
++}
++
++/* { dg-final { scan-assembler "vaddw\.u32" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
+@@ -0,0 +1,19 @@
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_neon_ok } */
++/* { dg-options "-O3" } */
++/* { dg-add-options arm_neon } */
++
++
++
++int
++t6 (int len, void * dummy, char * __restrict x)
++{
++  len = len & ~31;
++  unsigned short result = 0;
++  __asm volatile ("");
++  for (int i = 0; i < len; i++)
++    result += x[i];
++  return result;
++}
++
++/* { dg-final { scan-assembler "vaddw\.u8" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/pr37780_1.c
+@@ -0,0 +1,48 @@
++/* Test that we can remove the conditional move due to CLZ
++   being defined at zero.  */
++
++/* { dg-do compile } */
++/* { dg-require-effective-target arm_arch_v6t2_ok } */
++/* { dg-options "-O2" } */
++/* { dg-add-options arm_arch_v6t2 } */
++
++int
++fooctz (int i)
++{
++  return (i == 0) ? 32 : __builtin_ctz (i);
++}
++
++int
++fooctz2 (int i)
++{
++  return (i != 0) ? __builtin_ctz (i) : 32;
++}
++
++unsigned int
++fooctz3 (unsigned int i)
++{
++  return (i > 0) ?  __builtin_ctz (i) : 32;
++}
++
++/* { dg-final { scan-assembler-times "rbit\t*" 3 } } */
++
++int
++fooclz (int i)
++{
++  return (i == 0) ? 32 : __builtin_clz (i);
++}
++
++int
++fooclz2 (int i)
++{
++  return (i != 0) ? __builtin_clz (i) : 32;
++}
++
++unsigned int
++fooclz3 (unsigned int i)
++{
++  return (i > 0) ? __builtin_clz (i) : 32;
++}
++
++/* { dg-final { scan-assembler-times "clz\t" 6 } } */
++/* { dg-final { scan-assembler-not "cmp\t.*0" } } */
 --- a/src/gcc/testsuite/lib/gcc-dg.exp
 +++ b/src/gcc/testsuite/lib/gcc-dg.exp
 @@ -403,6 +403,7 @@ if { [info procs ${tool}_load] != [list] \
@@ -5499,6 +9162,17 @@ LANG=C git diff 45abd6d7c64044c2b09b971d3af06edfc5e1635e bb0c4802e3ecbe29b817501
  	    }
  	    set result [list $status [lindex $result 1]]
  	}
+--- a/src/gcc/testsuite/lib/target-supports.exp
++++ b/src/gcc/testsuite/lib/target-supports.exp
+@@ -4382,6 +4382,8 @@ proc check_effective_target_vect_widen_sum_hi_to_si_pattern { } {
+         set et_vect_widen_sum_hi_to_si_pattern_saved 0
+         if { [istarget powerpc*-*-*]
+              || [istarget aarch64*-*-*]
++	     || ([istarget arm*-*-*] &&
++		 [check_effective_target_arm_neon_ok])
+              || [istarget ia64-*-*] } {
+             set et_vect_widen_sum_hi_to_si_pattern_saved 1
+         }
 --- a/src/gcc/tree-scalar-evolution.c
 +++ b/src/gcc/tree-scalar-evolution.c
 @@ -1937,6 +1937,36 @@ interpret_rhs_expr (struct loop *loop, gimple *at_stmt,
diff --git a/debian/patches/vulcan-costs.diff b/debian/patches/vulcan-costs.diff
deleted file mode 100644
index ca5c90b..0000000
--- a/debian/patches/vulcan-costs.diff
+++ /dev/null
@@ -1,259 +0,0 @@
-# DP: Add cost model for vulcan CPU
-
-From: jgreenhalgh <jgreenhalgh at 138bc75d-0d04-0410-961f-82ee72b054a4>
-Date: Fri, 15 Jul 2016 11:17:53 +0000
-Subject: [PATCH] [PATCH/AARCH64] Add rtx_costs routine for vulcan.
-
-gcc/ChangeLog:
-
-2016-07-15  Virendra Pathak  <virendra.pathak at broadcom.com>
-	    Julian Brown  <julian at codesourcery.com>
-
-	* config/aarch64/aarch64-cores.def: Update vulcan COSTS.
-	* config/aarch64/aarch64-cost-tables.h
-	(vulcan_extra_costs): New variable.
-	* config/aarch64/aarch64.c
-	(vulcan_addrcost_table): Likewise.
-	(vulcan_regmove_cost): Likewise.
-	(vulcan_vector_cost): Likewise.
-	(vulcan_branch_cost): Likewise.
-	(vulcan_tunings): Likewise.
-
-[dannf: backported by removing approx_modes function pointer]
-
-diff -urpN a/src/gcc/config/aarch64/aarch64.c b/src/gcc/config/aarch64/aarch64.c
---- a/src/gcc/config/aarch64/aarch64.c	2016-07-15 16:14:24.268328586 +0000
-+++ b/src/gcc/config/aarch64/aarch64.c	2016-07-15 16:15:52.603299822 +0000
-@@ -250,6 +250,22 @@ static const struct cpu_addrcost_table x
-   0, /* imm_offset  */
- };
- 
-+static const struct cpu_addrcost_table vulcan_addrcost_table =
-+{
-+    {
-+      0, /* hi  */
-+      0, /* si  */
-+      0, /* di  */
-+      2, /* ti  */
-+    },
-+  0, /* pre_modify  */
-+  0, /* post_modify  */
-+  2, /* register_offset  */
-+  3, /* register_sextend  */
-+  3, /* register_zextend  */
-+  0, /* imm_offset  */
-+};
-+
- static const struct cpu_regmove_cost generic_regmove_cost =
- {
-   1, /* GP2GP  */
-@@ -308,6 +324,15 @@ static const struct cpu_regmove_cost xge
-   2 /* FP2FP  */
- };
- 
-+static const struct cpu_regmove_cost vulcan_regmove_cost =
-+{
-+  1, /* GP2GP  */
-+  /* Avoid the use of int<->fp moves for spilling.  */
-+  8, /* GP2FP  */
-+  8, /* FP2GP  */
-+  4  /* FP2FP  */
-+};
-+
- /* Generic costs for vector insn classes.  */
- static const struct cpu_vector_cost generic_vector_cost =
- {
-@@ -379,6 +404,24 @@ static const struct cpu_vector_cost xgen
-   1 /* cond_not_taken_branch_cost  */
- };
- 
-+/* Costs for vector insn classes for Vulcan.  */
-+static const struct cpu_vector_cost vulcan_vector_cost =
-+{
-+  6, /* scalar_stmt_cost  */
-+  4, /* scalar_load_cost  */
-+  1, /* scalar_store_cost  */
-+  6, /* vec_stmt_cost  */
-+  3, /* vec_permute_cost  */
-+  6, /* vec_to_scalar_cost  */
-+  5, /* scalar_to_vec_cost  */
-+  8, /* vec_align_load_cost  */
-+  8, /* vec_unalign_load_cost  */
-+  4, /* vec_unalign_store_cost  */
-+  4, /* vec_store_cost  */
-+  2, /* cond_taken_branch_cost  */
-+  1  /* cond_not_taken_branch_cost  */
-+};
-+
- /* Generic costs for branch instructions.  */
- static const struct cpu_branch_cost generic_branch_cost =
- {
-@@ -393,6 +436,13 @@ static const struct cpu_branch_cost cort
-   3   /* Unpredictable.  */
- };
- 
-+/* Branch costs for Vulcan.  */
-+static const struct cpu_branch_cost vulcan_branch_cost =
-+{
-+  1,  /* Predictable.  */
-+  3   /* Unpredictable.  */
-+};
-+
- static const struct tune_params generic_tunings =
- {
-   &cortexa57_extra_costs,
-@@ -589,6 +639,30 @@ static const struct tune_params xgene1_t
-   (AARCH64_EXTRA_TUNE_APPROX_RSQRT)	/* tune_flags.  */
- };
- 
-+static const struct tune_params vulcan_tunings =
-+{
-+  &vulcan_extra_costs,
-+  &vulcan_addrcost_table,
-+  &vulcan_regmove_cost,
-+  &vulcan_vector_cost,
-+  &vulcan_branch_cost,
-+  4, /* memmov_cost.  */
-+  4, /* issue_rate.  */
-+  AARCH64_FUSE_NOTHING, /* fuseable_ops.  */
-+  16,	/* function_align.  */
-+  8,	/* jump_align.  */
-+  16,	/* loop_align.  */
-+  3,	/* int_reassoc_width.  */
-+  2,	/* fp_reassoc_width.  */
-+  2,	/* vec_reassoc_width.  */
-+  2,	/* min_div_recip_mul_sf.  */
-+  2,	/* min_div_recip_mul_df.  */
-+  0,	/* max_case_values.  */
-+  0,	/* cache_line_size.  */
-+  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
-+  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
-+};
-+
- /* Support for fine-grained override of the tuning structures.  */
- struct aarch64_tuning_override_function
- {
-diff -urpN a/src/gcc/config/aarch64/aarch64-cores.def b/src/gcc/config/aarch64/aarch64-cores.def
---- a/src/gcc/config/aarch64/aarch64-cores.def	2016-07-15 16:14:24.272328721 +0000
-+++ b/src/gcc/config/aarch64/aarch64-cores.def	2016-07-15 16:15:26.730430056 +0000
-@@ -51,7 +51,7 @@ AARCH64_CORE("xgene1",      xgene1,    x
- 
- /* V8.1 Architecture Processors.  */
- 
--AARCH64_CORE("vulcan",  vulcan, cortexa57, 8_1A,  AARCH64_FL_FOR_ARCH8_1 | AARCH64_FL_CRYPTO, cortexa57, "0x42", "0x516")
-+AARCH64_CORE("vulcan",  vulcan, cortexa57, 8_1A,  AARCH64_FL_FOR_ARCH8_1 | AARCH64_FL_CRYPTO, vulcan, "0x42", "0x516")
- 
- /* V8 big.LITTLE implementations.  */
- 
-diff -urpN a/src/gcc/config/aarch64/aarch64-cost-tables.h b/src/gcc/config/aarch64/aarch64-cost-tables.h
---- a/src/gcc/config/aarch64/aarch64-cost-tables.h	2016-07-15 16:14:24.272328721 +0000
-+++ b/src/gcc/config/aarch64/aarch64-cost-tables.h	2016-07-15 16:15:26.730430056 +0000
-@@ -127,6 +127,108 @@ const struct cpu_cost_table thunderx_ext
-   }
- };
- 
-+const struct cpu_cost_table vulcan_extra_costs =
-+{
-+  /* ALU */
-+  {
-+    0,			/* Arith.  */
-+    0,			/* Logical.  */
-+    0,			/* Shift.  */
-+    0,			/* Shift_reg.  */
-+    COSTS_N_INSNS (1),	/* Arith_shift.  */
-+    COSTS_N_INSNS (1),	/* Arith_shift_reg.  */
-+    COSTS_N_INSNS (1),	/* Log_shift.  */
-+    COSTS_N_INSNS (1),	/* Log_shift_reg.  */
-+    0,			/* Extend.  */
-+    COSTS_N_INSNS (1),	/* Extend_arith.  */
-+    0,			/* Bfi.  */
-+    0,			/* Bfx.  */
-+    COSTS_N_INSNS (3),	/* Clz.  */
-+    0,			/* Rev.  */
-+    0,			/* Non_exec.  */
-+    true		/* Non_exec_costs_exec.  */
-+  },
-+  {
-+    /* MULT SImode */
-+    {
-+      COSTS_N_INSNS (4),	/* Simple.  */
-+      COSTS_N_INSNS (4),	/* Flag_setting.  */
-+      COSTS_N_INSNS (4),	/* Extend.  */
-+      COSTS_N_INSNS (5),	/* Add.  */
-+      COSTS_N_INSNS (5),	/* Extend_add.  */
-+      COSTS_N_INSNS (18)	/* Idiv.  */
-+    },
-+    /* MULT DImode */
-+    {
-+      COSTS_N_INSNS (4),       /* Simple.  */
-+      0,                       /* Flag_setting.  */
-+      COSTS_N_INSNS (4),       /* Extend.  */
-+      COSTS_N_INSNS (5),       /* Add.  */
-+      COSTS_N_INSNS (5),       /* Extend_add.  */
-+      COSTS_N_INSNS (26)       /* Idiv.  */
-+    }
-+  },
-+  /* LD/ST */
-+  {
-+    COSTS_N_INSNS (4),	/* Load.  */
-+    COSTS_N_INSNS (4),	/* Load_sign_extend.  */
-+    COSTS_N_INSNS (5),	/* Ldrd.  */
-+    COSTS_N_INSNS (4),	/* Ldm_1st.  */
-+    1,			/* Ldm_regs_per_insn_1st.  */
-+    1,			/* Ldm_regs_per_insn_subsequent.  */
-+    COSTS_N_INSNS (4),	/* Loadf.  */
-+    COSTS_N_INSNS (4),	/* Loadd.  */
-+    COSTS_N_INSNS (4),	/* Load_unaligned.  */
-+    0,			/* Store.  */
-+    0,			/* Strd.  */
-+    0,			/* Stm_1st.  */
-+    1,			/* Stm_regs_per_insn_1st.  */
-+    1,			/* Stm_regs_per_insn_subsequent.  */
-+    0,			/* Storef.  */
-+    0,			/* Stored.  */
-+    0,			/* Store_unaligned.  */
-+    COSTS_N_INSNS (1),	/* Loadv.  */
-+    COSTS_N_INSNS (1)	/* Storev.  */
-+  },
-+  {
-+    /* FP SFmode */
-+    {
-+      COSTS_N_INSNS (4),	/* Div.  */
-+      COSTS_N_INSNS (1),	/* Mult.  */
-+      COSTS_N_INSNS (1),	/* Mult_addsub. */
-+      COSTS_N_INSNS (1),	/* Fma.  */
-+      COSTS_N_INSNS (1),	/* Addsub.  */
-+      COSTS_N_INSNS (1),	/* Fpconst. */
-+      COSTS_N_INSNS (1),	/* Neg.  */
-+      COSTS_N_INSNS (1),	/* Compare.  */
-+      COSTS_N_INSNS (2),	/* Widen.  */
-+      COSTS_N_INSNS (2),	/* Narrow.  */
-+      COSTS_N_INSNS (2),	/* Toint.  */
-+      COSTS_N_INSNS (2),	/* Fromint.  */
-+      COSTS_N_INSNS (2) 	/* Roundint.  */
-+    },
-+    /* FP DFmode */
-+    {
-+      COSTS_N_INSNS (6),	/* Div.  */
-+      COSTS_N_INSNS (1),	/* Mult.  */
-+      COSTS_N_INSNS (1),	/* Mult_addsub.  */
-+      COSTS_N_INSNS (1),	/* Fma.  */
-+      COSTS_N_INSNS (1),	/* Addsub.  */
-+      COSTS_N_INSNS (1),	/* Fpconst.  */
-+      COSTS_N_INSNS (1),	/* Neg.  */
-+      COSTS_N_INSNS (1),	/* Compare.  */
-+      COSTS_N_INSNS (2),	/* Widen.  */
-+      COSTS_N_INSNS (2),	/* Narrow.  */
-+      COSTS_N_INSNS (2),	/* Toint.  */
-+      COSTS_N_INSNS (2),	/* Fromint.  */
-+      COSTS_N_INSNS (2) 	/* Roundint.  */
-+    }
-+  },
-+  /* Vector */
-+  {
-+    COSTS_N_INSNS (1)	/* Alu.  */
-+  }
-+};
- 
- 
- #endif
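
For context on the numbers in the table just removed, a hedged aside on GCC internals (recalled from gcc/rtl.h, not stated in this commit): the entries are scaled through COSTS_N_INSNS, which converts an instruction count into GCC's abstract rtx-cost units.

/* Recalled definition, an assumption about the GCC 6 sources: */
#define COSTS_N_INSNS(N) ((N) * 4)
/* So the vulcan Load entry, COSTS_N_INSNS (4), is 16 cost units,
   while entries written as plain 0 add no cost at all.  */
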
diff --git a/debian/patches/vulcan-cpu-doc.diff b/debian/patches/vulcan-cpu-doc.diff
deleted file mode 100644
index 4656259..0000000
--- a/debian/patches/vulcan-cpu-doc.diff
+++ /dev/null
@@ -1,27 +0,0 @@
-# DP: Accept vulcan as a cpu name for the AArch64 port of GCC (documentation)
-
-From: jgreenhalgh <jgreenhalgh at 138bc75d-0d04-0410-961f-82ee72b054a4>
-Date: Tue, 21 Jun 2016 13:43:29 +0000 (+0000)
-Subject: [PATCH/AARCH64] Accept vulcan as a cpu name for the AArch64 port of GCC
-X-Git-Url: https://gcc.gnu.org/git/?p=gcc.git;a=commitdiff_plain;h=2c6ac78145ac8ff2fd83271d093e23ab80a15e4f
-
-[PATCH/AARCH64] Accept vulcan as a cpu name for the AArch64 port of GCC
-
-gcc/ChangeLog
-
-	* config/aarch64/aarch64-cores.def (vulcan): New core.
-	* config/aarch64/aarch64-tune.md: Regenerate.
-	* doc/invoke.texi: Document vulcan as an available option.
-
-diff -urpN a/src/gcc/doc/invoke.texi b/src/gcc/doc/invoke.texi
---- a/src/gcc/doc/invoke.texi	2016-06-21 10:31:29.994143994 -0600
-+++ b/src/gcc/doc/invoke.texi	2016-06-21 10:35:51.136081208 -0600
-@@ -12988,7 +12988,7 @@ Specify the name of the target processor
- performance of the code.  Permissible values for this option are:
- @samp{generic}, @samp{cortex-a35}, @samp{cortex-a53}, @samp{cortex-a57},
- @samp{cortex-a72}, @samp{exynos-m1}, @samp{qdf24xx}, @samp{thunderx},
--@samp{xgene1}.
-+@samp{vulcan}, @samp{xgene1}.
- 
- Additionally, this option can specify that GCC should tune the performance
- of the code for a big.LITTLE system.  Permissible values for this
diff --git a/debian/patches/vulcan-cpu.diff b/debian/patches/vulcan-cpu.diff
deleted file mode 100644
index 29edebe..0000000
--- a/debian/patches/vulcan-cpu.diff
+++ /dev/null
@@ -1,39 +0,0 @@
-# DP: Accept vulcan as a cpu name for the AArch64 port of GCC
-
-From: jgreenhalgh <jgreenhalgh at 138bc75d-0d04-0410-961f-82ee72b054a4>
-Date: Tue, 21 Jun 2016 13:43:29 +0000 (+0000)
-Subject: [PATCH/AARCH64] Accept vulcan as a cpu name for the AArch64 port of GCC
-X-Git-Url: https://gcc.gnu.org/git/?p=gcc.git;a=commitdiff_plain;h=2c6ac78145ac8ff2fd83271d093e23ab80a15e4f
-
-[PATCH/AARCH64] Accept vulcan as a cpu name for the AArch64 port of GCC
-
-gcc/ChangeLog
-
-	* config/aarch64/aarch64-cores.def (vulcan): New core.
-	* config/aarch64/aarch64-tune.md: Regenerate.
-	* doc/invoke.texi: Document vulcan as an available option.
-
-diff -urpN a/src/gcc/config/aarch64/aarch64-cores.def b/src/gcc/config/aarch64/aarch64-cores.def
---- a/src/gcc/config/aarch64/aarch64-cores.def	2016-01-04 07:30:50.000000000 -0700
-+++ b/src/gcc/config/aarch64/aarch64-cores.def	2016-06-21 10:32:59.191974071 -0600
-@@ -49,6 +49,10 @@ AARCH64_CORE("qdf24xx",     qdf24xx,   c
- AARCH64_CORE("thunderx",    thunderx,  thunderx,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx,  "0x43", "0x0a1")
- AARCH64_CORE("xgene1",      xgene1,    xgene1,    8A,  AARCH64_FL_FOR_ARCH8, xgene1, "0x50", "0x000")
- 
-+/* V8.1 Architecture Processors.  */
-+
-+AARCH64_CORE("vulcan",  vulcan, cortexa57, 8_1A,  AARCH64_FL_FOR_ARCH8_1 | AARCH64_FL_CRYPTO, cortexa57, "0x42", "0x516")
-+
- /* V8 big.LITTLE implementations.  */
- 
- AARCH64_CORE("cortex-a57.cortex-a53",  cortexa57cortexa53, cortexa53, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, "0x41", "0xd07.0xd03")
-diff -urpN a/src/gcc/config/aarch64/aarch64-tune.md b/src/gcc/config/aarch64/aarch64-tune.md
---- a/src/gcc/config/aarch64/aarch64-tune.md	2016-04-27 02:22:11.000000000 -0600
-+++ b/src/gcc/config/aarch64/aarch64-tune.md	2016-06-21 10:32:59.191974071 -0600
-@@ -1,5 +1,5 @@
- ;; -*- buffer-read-only: t -*-
- ;; Generated automatically by gentune.sh from aarch64-cores.def
- (define_attr "tune"
--	"cortexa35,cortexa53,cortexa57,cortexa72,exynosm1,qdf24xx,thunderx,xgene1,cortexa57cortexa53,cortexa72cortexa53"
-+	"cortexa35,cortexa53,cortexa57,cortexa72,exynosm1,qdf24xx,thunderx,xgene1,vulcan,cortexa57cortexa53,cortexa72cortexa53"
- 	(const (symbol_ref "((enum attr_tune) aarch64_tune)")))
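
A hedged gloss on the AARCH64_CORE entry removed above (the field layout is recalled from aarch64-cores.def conventions, not restated in this patch):

/* AARCH64_CORE (NAME, INTERNAL_IDENT, SCHEDULER, ARCH, FLAGS,
                 COSTS, IMPLEMENTER_ID, PART_NUMBER)
   For vulcan, "0x42" is Broadcom's MIDR implementer id and "0x516"
   the part number; -mcpu=native matches these against the values
   the kernel reports in /proc/cpuinfo.  */
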
diff --git a/debian/rules.patch b/debian/rules.patch
index b7f9161..a9010b5 100644
--- a/debian/rules.patch
+++ b/debian/rules.patch
@@ -30,7 +30,6 @@ ifneq ($(GFDL_INVARIANT_FREE),yes)
 	rename-info-files \
 	gcc-SOURCE_DATE_EPOCH-doc \
 	gcc-SOURCE_DATE_EPOCH-2-doc \
-	vulcan-cpu-doc \
 
 #	svn-doc-updates \
 #	$(if $(with_linaro_branch),,svn-doc-updates) \
@@ -85,8 +84,6 @@ debian_patches += \
 	gcc-SOURCE_DATE_EPOCH \
 	gcc-SOURCE_DATE_EPOCH-2 \
 	cmd-go-combine-gccgo-s-ld-and-ldShared-methods \
-	vulcan-cpu \
-	vulcan-costs \
 	libjava-mips64el \
 	PR55947-revert \
 	gccgo-issue16780 \

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/gcc-6.git


