[gcc-6] 373/401: * Update to SVN 20170211 (r245353) from the gcc-6-branch. * Update the Linaro support to the 6.3-2017.02 snapshot.
Ximin Luo
infinity0@debian.org
Wed Apr 5 15:50:40 UTC 2017
This is an automated email from the git hooks/post-receive script.
infinity0 pushed a commit to branch pu/reproducible_builds
in repository gcc-6.
commit a16203e0452be24968ae36015c0b578dc7d039ba
Author: doko <doko@6ca36cf4-e1d1-0310-8c6f-e303bb2178ca>
Date: Sat Feb 11 01:11:34 2017 +0000
* Update to SVN 20170211 (r245353) from the gcc-6-branch.
* Update the Linaro support to the 6.3-2017.02 snapshot.
git-svn-id: svn://anonscm.debian.org/gcccvs/branches/sid/gcc-6@9297 6ca36cf4-e1d1-0310-8c6f-e303bb2178ca
---
debian/changelog | 14 +-
debian/patches/gcc-linaro-doc.diff | 2 +-
debian/patches/gcc-linaro-no-macros.diff | 2 +-
debian/patches/gcc-linaro-r244242-revert.diff | 184 -
debian/patches/gcc-linaro-r244724-revert.diff | 246 -
debian/patches/gcc-linaro.diff | 24734 ++++++++++++++++++++++--
debian/patches/svn-updates.diff | 389 +-
debian/rules.patch | 2 -
8 files changed, 23439 insertions(+), 2134 deletions(-)
diff --git a/debian/changelog b/debian/changelog
index 87756d8..c2c8a0b 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,9 +1,19 @@
+gcc-6 (6.3.0-7) UNRELEASED; urgency=medium
+
+ * Update to SVN 20170211 (r245353) from the gcc-6-branch.
+ - Fix PR target/78945 (ARM), PR translation/79397,
+ PR tree-optimization/71824,
+ PR tree-optimization/77318, PR target/71017 (x86), PR c++/78897.
+ * Update the Linaro support to the 6.3-2017.02 snapshot.
+
+ -- Matthias Klose <doko@ubuntu.com>  Sat, 11 Feb 2017 01:49:22 +0100
+
gcc-6 (6.3.0-6) unstable; urgency=medium
* Update to SVN 20170205 (r245197) from the gcc-6-branch.
- Fix PR libstdc++/78346, PR libstdc++/79195, PR libstdc++/79254,
- PR target/78478, PR target/79268 (PPC), PR tree-optimization/79034,
- PR middle-end/78742, PR target/77439 (ARM32), PR c++/79176,
+ PR target/78478, PR target/79268 (PPC, LP: #1661051), PR c++/79176,
+ PR middle-end/78742, PR target/77439 (ARM32), PR tree-optimization/79034,
PR fortran/70697, PR fortran/70696, PR fortran/79305, PR go/79037,
PR go/79281 (closes: #853223), PR target/78862 (tilegx), PR lto/79061,
PR target/65484 (PPC).
diff --git a/debian/patches/gcc-linaro-doc.diff b/debian/patches/gcc-linaro-doc.diff
index 6b9f8a1..6ee9e58 100644
--- a/debian/patches/gcc-linaro-doc.diff
+++ b/debian/patches/gcc-linaro-doc.diff
@@ -1,4 +1,4 @@
-# DP: Changes for the Linaro 6-2017.01 release (documentation).
+# DP: Changes for the Linaro 6-2017.02 release (documentation).
--- a/src/gcc/doc/cpp.texi
+++ b/src/gcc/doc/cpp.texi
diff --git a/debian/patches/gcc-linaro-no-macros.diff b/debian/patches/gcc-linaro-no-macros.diff
index 8755be9..856e9d9 100644
--- a/debian/patches/gcc-linaro-no-macros.diff
+++ b/debian/patches/gcc-linaro-no-macros.diff
@@ -89,7 +89,7 @@ Index: b/src/gcc/LINARO-VERSION
--- a/src/gcc/LINARO-VERSION
+++ /dev/null
@@ -1,1 +0,0 @@
--Snapshot 6.3-2017.01
+-6.3-2017.02~dev
Index: b/src/gcc/configure.ac
===================================================================
--- a/src/gcc/configure.ac
diff --git a/debian/patches/gcc-linaro-r244242-revert.diff b/debian/patches/gcc-linaro-r244242-revert.diff
deleted file mode 100644
index 4f2a0e0..0000000
--- a/debian/patches/gcc-linaro-r244242-revert.diff
+++ /dev/null
@@ -1,184 +0,0 @@
-Index: configure.ac
-===================================================================
---- a/src/configure.ac (revision 244242)
-+++ a/src/configure.ac (revision 244241)
-@@ -819,9 +819,6 @@
- *-*-vxworks*)
- noconfigdirs="$noconfigdirs ${libgcj}"
- ;;
-- aarch64*-*-freebsd*)
-- noconfigdirs="$noconfigdirs ${libgcj}"
-- ;;
- alpha*-*-*vms*)
- noconfigdirs="$noconfigdirs ${libgcj}"
- ;;
-Index: libgcc/config.host
-===================================================================
---- a/src/libgcc/config.host (revision 244242)
-+++ a/src/libgcc/config.host (revision 244241)
-@@ -333,11 +333,6 @@
- tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
- tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
- ;;
--aarch64*-*-freebsd*)
-- extra_parts="$extra_parts crtfastmath.o"
-- tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
-- tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
-- ;;
- aarch64*-*-linux*)
- extra_parts="$extra_parts crtfastmath.o"
- md_unwind_header=aarch64/linux-unwind.h
-Index: gcc/config.gcc
-===================================================================
---- a/src/gcc/config.gcc (revision 244242)
-+++ a/src/gcc/config.gcc (revision 244241)
-@@ -946,11 +946,6 @@
- done
- TM_MULTILIB_CONFIG=`echo $TM_MULTILIB_CONFIG | sed 's/^,//'`
- ;;
--aarch64*-*-freebsd*)
-- tm_file="${tm_file} dbxelf.h elfos.h ${fbsd_tm_file}"
-- tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-freebsd.h"
-- tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-freebsd"
-- ;;
- aarch64*-*-linux*)
- tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h glibc-stdint.h"
- tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-linux.h"
-Index: gcc/config.host
-===================================================================
---- a/src/gcc/config.host (revision 244242)
-+++ a/src/gcc/config.host (revision 244241)
-@@ -99,7 +99,7 @@
- esac
-
- case ${host} in
-- aarch64*-*-freebsd* | aarch64*-*-linux*)
-+ aarch64*-*-linux*)
- case ${target} in
- aarch64*-*-*)
- host_extra_gcc_objs="driver-aarch64.o"
-Index: gcc/config/aarch64/t-aarch64-freebsd
-===================================================================
---- a/src/gcc/config/aarch64/t-aarch64-freebsd (revision 244242)
-+++ a/src/gcc/config/aarch64/t-aarch64-freebsd (nonexistent)
-@@ -1,21 +0,0 @@
--# Machine description for AArch64 architecture.
--# Copyright (C) 2016-2017 Free Software Foundation, Inc.
--#
--# This file is part of GCC.
--#
--# GCC is free software; you can redistribute it and/or modify it
--# under the terms of the GNU General Public License as published by
--# the Free Software Foundation; either version 3, or (at your option)
--# any later version.
--#
--# GCC is distributed in the hope that it will be useful, but
--# WITHOUT ANY WARRANTY; without even the implied warranty of
--# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
--# General Public License for more details.
--#
--# You should have received a copy of the GNU General Public License
--# along with GCC; see the file COPYING3. If not see
--# <http://www.gnu.org/licenses/>.
--
--LIB1ASMSRC = aarch64/lib1funcs.asm
--LIB1ASMFUNCS = _aarch64_sync_cache_range
-Index: gcc/config/aarch64/aarch64-freebsd.h
-===================================================================
---- a/src/gcc/config/aarch64/aarch64-freebsd.h (revision 244242)
-+++ a/src/gcc/config/aarch64/aarch64-freebsd.h (nonexistent)
-@@ -1,94 +0,0 @@
--/* Definitions for AArch64 running FreeBSD
-- Copyright (C) 2016-2017 Free Software Foundation, Inc.
--
-- This file is part of GCC.
--
-- GCC is free software; you can redistribute it and/or modify it
-- under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 3, or (at your option)
-- any later version.
--
-- GCC is distributed in the hope that it will be useful, but
-- WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-- General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with GCC; see the file COPYING3. If not see
-- <http://www.gnu.org/licenses/>. */
--
--#ifndef GCC_AARCH64_FREEBSD_H
--#define GCC_AARCH64_FREEBSD_H
--
--#undef SUBTARGET_CPP_SPEC
--#define SUBTARGET_CPP_SPEC FBSD_CPP_SPEC
--
--#if TARGET_BIG_ENDIAN_DEFAULT
--#define TARGET_LINKER_EMULATION "aarch64fbsdb"
--#else
--#define TARGET_LINKER_EMULATION "aarch64fbsd"
--#endif
--
--#undef SUBTARGET_EXTRA_LINK_SPEC
--#define SUBTARGET_EXTRA_LINK_SPEC " -m" TARGET_LINKER_EMULATION
--
--#undef FBSD_TARGET_LINK_SPEC
--#define FBSD_TARGET_LINK_SPEC " \
-- %{p:%nconsider using `-pg' instead of `-p' with gprof (1) } \
-- %{v:-V} \
-- %{assert*} %{R*} %{rpath*} %{defsym*} \
-- %{shared:-Bshareable %{h*} %{soname*}} \
-- %{symbolic:-Bsymbolic} \
-- %{static:-Bstatic} \
-- %{!static: \
-- %{rdynamic:-export-dynamic} \
-- %{!shared:-dynamic-linker " FBSD_DYNAMIC_LINKER " }} \
-- -X" SUBTARGET_EXTRA_LINK_SPEC " \
-- %{mbig-endian:-EB} %{mlittle-endian:-EL}"
--
--#if TARGET_FIX_ERR_A53_835769_DEFAULT
--#define CA53_ERR_835769_SPEC \
-- " %{!mno-fix-cortex-a53-835769:--fix-cortex-a53-835769}"
--#else
--#define CA53_ERR_835769_SPEC \
-- " %{mfix-cortex-a53-835769:--fix-cortex-a53-835769}"
--#endif
--
--#ifdef TARGET_FIX_ERR_A53_843419_DEFAULT
--#define CA53_ERR_843419_SPEC \
-- " %{!mno-fix-cortex-a53-843419:--fix-cortex-a53-843419}"
--#else
--#define CA53_ERR_843419_SPEC \
-- " %{mfix-cortex-a53-843419:--fix-cortex-a53-843419}"
--#endif
--
--#undef LINK_SPEC
--#define LINK_SPEC FBSD_TARGET_LINK_SPEC \
-- CA53_ERR_835769_SPEC \
-- CA53_ERR_843419_SPEC
--
--#define GNU_USER_TARGET_MATHFILE_SPEC \
-- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}"
--
--#undef ENDFILE_SPEC
--#define ENDFILE_SPEC \
-- GNU_USER_TARGET_MATHFILE_SPEC " " \
-- FBSD_ENDFILE_SPEC
--
--#undef TARGET_OS_CPP_BUILTINS
--#define TARGET_OS_CPP_BUILTINS() \
-- do \
-- { \
-- FBSD_TARGET_OS_CPP_BUILTINS (); \
-- } \
-- while (false)
--
--#define TARGET_ASM_FILE_END file_end_indicate_exec_stack
--
--/* Uninitialized common symbols in non-PIE executables, even with
-- strong definitions in dependent shared libraries, will resolve
-- to COPY relocated symbol in the executable. See PR65780. */
--#undef TARGET_BINDS_LOCAL_P
--#define TARGET_BINDS_LOCAL_P default_binds_local_p_2
--
--#endif /* GCC_AARCH64_FREEBSD_H */
diff --git a/debian/patches/gcc-linaro-r244724-revert.diff b/debian/patches/gcc-linaro-r244724-revert.diff
deleted file mode 100644
index af336e9..0000000
--- a/debian/patches/gcc-linaro-r244724-revert.diff
+++ /dev/null
@@ -1,246 +0,0 @@
-# DP: Revert PR target/77455, already in the Linaro branch
-
---- a/src/gcc/testsuite/gcc.target/aarch64/eh_return.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/eh_return.c
-@@ -1,82 +0,0 @@
--/* { dg-do run } */
--/* { dg-options "-O2 -fno-inline" } */
--
--#include <stdlib.h>
--#include <stdio.h>
--
--int val, test, failed;
--
--int main (void);
--
--void
--eh0 (void *p)
--{
-- val = (int)(long)p & 7;
-- if (val)
-- abort ();
--}
--
--void
--eh1 (void *p, int x)
--{
-- void *q = __builtin_alloca (x);
-- eh0 (q);
-- __builtin_eh_return (0, p);
--}
--
--void
--eh2a (int a,int b,int c,int d,int e,int f,int g,int h, void *p)
--{
-- val = a + b + c + d + e + f + g + h + (int)(long)p & 7;
--}
--
--void
--eh2 (void *p)
--{
-- eh2a (val, val, val, val, val, val, val, val, p);
-- __builtin_eh_return (0, p);
--}
--
--
--void
--continuation (void)
--{
-- test++;
-- main ();
--}
--
--void
--fail (void)
--{
-- failed = 1;
-- printf ("failed\n");
-- continuation ();
--}
--
--void
--do_test1 (void)
--{
-- if (!val)
-- eh1 (continuation, 100);
-- fail ();
--}
--
--void
--do_test2 (void)
--{
-- if (!val)
-- eh2 (continuation);
-- fail ();
--}
--
--int
--main (void)
--{
-- if (test == 0)
-- do_test1 ();
-- if (test == 1)
-- do_test2 ();
-- if (failed || test != 2)
-- exit (1);
-- exit (0);
--}
---- a/src/gcc/config/aarch64/aarch64.md
-+++ b/src/gcc/config/aarch64/aarch64.md
-@@ -587,6 +587,25 @@
- [(set_attr "type" "branch")]
- )
-
-+(define_insn "eh_return"
-+ [(unspec_volatile [(match_operand:DI 0 "register_operand" "r")]
-+ UNSPECV_EH_RETURN)]
-+ ""
-+ "#"
-+ [(set_attr "type" "branch")]
-+
-+)
-+
-+(define_split
-+ [(unspec_volatile [(match_operand:DI 0 "register_operand" "")]
-+ UNSPECV_EH_RETURN)]
-+ "reload_completed"
-+ [(set (match_dup 1) (match_dup 0))]
-+ {
-+ operands[1] = aarch64_final_eh_return_addr ();
-+ }
-+)
-+
- (define_insn "*cb<optab><mode>1"
- [(set (pc) (if_then_else (EQL (match_operand:GPI 0 "register_operand" "r")
- (const_int 0))
---- a/src/gcc/config/aarch64/aarch64-protos.h
-+++ b/src/gcc/config/aarch64/aarch64-protos.h
-@@ -338,7 +338,7 @@
- int aarch64_simd_attr_length_move (rtx_insn *);
- int aarch64_uxt_size (int, HOST_WIDE_INT);
- int aarch64_vec_fpconst_pow_of_2 (rtx);
--rtx aarch64_eh_return_handler_rtx (void);
-+rtx aarch64_final_eh_return_addr (void);
- rtx aarch64_legitimize_reload_address (rtx *, machine_mode, int, int, int);
- rtx aarch64_mask_from_zextract_ops (rtx, rtx);
- const char *aarch64_output_move_struct (rtx *operands);
---- a/src/gcc/config/aarch64/aarch64.c
-+++ b/src/gcc/config/aarch64/aarch64.c
-@@ -2480,10 +2480,6 @@
- && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
- return true;
-
-- /* Force a frame pointer for EH returns so the return address is at FP+8. */
-- if (crtl->calls_eh_return)
-- return true;
--
- return false;
- }
-
-@@ -3040,8 +3036,7 @@
- rtx_insn *insn;
- /* We need to add memory barrier to prevent read from deallocated stack. */
- bool need_barrier_p = (get_frame_size () != 0
-- || cfun->machine->frame.saved_varargs_size
-- || crtl->calls_eh_return);
-+ || cfun->machine->frame.saved_varargs_size);
-
- aarch64_layout_frame ();
-
-@@ -3194,40 +3189,52 @@
- emit_jump_insn (ret_rtx);
- }
-
--/* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
-- normally or return to a previous frame after unwinding.
-+/* Return the place to copy the exception unwinding return address to.
-+ This will probably be a stack slot, but could (in theory be the
-+ return register). */
-+rtx
-+aarch64_final_eh_return_addr (void)
-+{
-+ HOST_WIDE_INT fp_offset;
-
-- An EH return uses a single shared return sequence. The epilogue is
-- exactly like a normal epilogue except that it has an extra input
-- register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
-- that must be applied after the frame has been destroyed. An extra label
-- is inserted before the epilogue which initializes this register to zero,
-- and this is the entry point for a normal return.
-+ aarch64_layout_frame ();
-
-- An actual EH return updates the return address, initializes the stack
-- adjustment and jumps directly into the epilogue (bypassing the zeroing
-- of the adjustment). Since the return address is typically saved on the
-- stack when a function makes a call, the saved LR must be updated outside
-- the epilogue.
-+ fp_offset = cfun->machine->frame.frame_size
-+ - cfun->machine->frame.hard_fp_offset;
-
-- This poses problems as the store is generated well before the epilogue,
-- so the offset of LR is not known yet. Also optimizations will remove the
-- store as it appears dead, even after the epilogue is generated (as the
-- base or offset for loading LR is different in many cases).
-+ if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
-+ return gen_rtx_REG (DImode, LR_REGNUM);
-
-- To avoid these problems this implementation forces the frame pointer
-- in eh_return functions so that the location of LR is fixed and known early.
-- It also marks the store volatile, so no optimization is permitted to
-- remove the store. */
--rtx
--aarch64_eh_return_handler_rtx (void)
--{
-- rtx tmp = gen_frame_mem (Pmode,
-- plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
-+ /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
-+ result in a store to save LR introduced by builtin_eh_return () being
-+ incorrectly deleted because the alias is not detected.
-+ So in the calculation of the address to copy the exception unwinding
-+ return address to, we note 2 cases.
-+ If FP is needed and the fp_offset is 0, it means that SP = FP and hence
-+ we return a SP-relative location since all the addresses are SP-relative
-+ in this case. This prevents the store from being optimized away.
-+ If the fp_offset is not 0, then the addresses will be FP-relative and
-+ therefore we return a FP-relative location. */
-
-- /* Mark the store volatile, so no optimization is permitted to remove it. */
-- MEM_VOLATILE_P (tmp) = true;
-- return tmp;
-+ if (frame_pointer_needed)
-+ {
-+ if (fp_offset)
-+ return gen_frame_mem (DImode,
-+ plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
-+ else
-+ return gen_frame_mem (DImode,
-+ plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
-+ }
-+
-+ /* If FP is not needed, we calculate the location of LR, which would be
-+ at the top of the saved registers block. */
-+
-+ return gen_frame_mem (DImode,
-+ plus_constant (Pmode,
-+ stack_pointer_rtx,
-+ fp_offset
-+ + cfun->machine->frame.saved_regs_size
-+ - 2 * UNITS_PER_WORD));
- }
-
- /* Possibly output code to build up a constant in a register. For
---- a/src/gcc/config/aarch64/aarch64.h
-+++ b/src/gcc/config/aarch64/aarch64.h
-@@ -389,9 +389,9 @@
- #define ASM_DECLARE_FUNCTION_NAME(STR, NAME, DECL) \
- aarch64_declare_function_name (STR, NAME, DECL)
-
--/* For EH returns X4 contains the stack adjustment. */
--#define EH_RETURN_STACKADJ_RTX gen_rtx_REG (Pmode, R4_REGNUM)
--#define EH_RETURN_HANDLER_RTX aarch64_eh_return_handler_rtx ()
-+/* The register that holds the return address in exception handlers. */
-+#define AARCH64_EH_STACKADJ_REGNUM (R0_REGNUM + 4)
-+#define EH_RETURN_STACKADJ_RTX gen_rtx_REG (Pmode, AARCH64_EH_STACKADJ_REGNUM)
-
- /* Don't use __builtin_setjmp until we've defined it. */
- #undef DONT_USE_BUILTIN_SETJMP
diff --git a/debian/patches/gcc-linaro.diff b/debian/patches/gcc-linaro.diff
index 2ab9e5d..10dc0ff 100644
--- a/debian/patches/gcc-linaro.diff
+++ b/debian/patches/gcc-linaro.diff
@@ -1,36 +1,12 @@
-# DP: Changes for the Linaro 6-2017.01 release.
+# DP: Changes for the Linaro 6-2017.02 release.
MSG=$(git log origin/linaro/gcc-6-branch --format=format:"%s" -n 1 --grep "Merge branches"); SVN=${MSG##* }; git log origin/gcc-6-branch --format=format:"%H" -n 1 --grep "gcc-6-branch@${SVN%.}"
-LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b2388c31a0f6c7a82110ee77baf2c8bf \
+LANG=C git diff --no-renames 9087fb2ff49a31be20f2a118a863b550ac58e26d..aad858c02179d21167c43b18b31b3e8008dae1ed \
| egrep -v '^(diff|index) ' \
| filterdiff --strip=1 --addoldprefix=a/src/ --addnewprefix=b/src/ \
| sed 's,a/src//dev/null,/dev/null,'
---- a/src/configure
-+++ b/src/configure
-@@ -3483,6 +3483,9 @@ case "${target}" in
- *-*-vxworks*)
- noconfigdirs="$noconfigdirs ${libgcj}"
- ;;
-+ aarch64*-*-freebsd*)
-+ noconfigdirs="$noconfigdirs target-libffi"
-+ ;;
- alpha*-*-*vms*)
- noconfigdirs="$noconfigdirs ${libgcj}"
- ;;
---- a/src/configure.ac
-+++ b/src/configure.ac
-@@ -819,6 +819,9 @@ case "${target}" in
- *-*-vxworks*)
- noconfigdirs="$noconfigdirs ${libgcj}"
- ;;
-+ aarch64*-*-freebsd*)
-+ noconfigdirs="$noconfigdirs target-libffi"
-+ ;;
- alpha*-*-*vms*)
- noconfigdirs="$noconfigdirs ${libgcj}"
- ;;
--- a/src/contrib/compare_tests
+++ b/src/contrib/compare_tests
@@ -107,8 +107,8 @@ elif [ -d "$1" -o -d "$2" ] ; then
@@ -134,7 +110,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
--- /dev/null
+++ b/src/gcc/LINARO-VERSION
@@ -0,0 +1 @@
-+Snapshot 6.3-2017.01
++6.3-2017.02~dev
--- a/src/gcc/Makefile.in
+++ b/src/gcc/Makefile.in
@@ -832,10 +832,12 @@ BASEVER := $(srcdir)/BASE-VER # 4.x.y
@@ -204,7 +180,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
#include "tm_p.h"
--- a/src/gcc/c-family/c-opts.c
+++ b/src/gcc/c-family/c-opts.c
-@@ -767,8 +767,7 @@ c_common_post_options (const char **pfilename)
+@@ -772,8 +772,7 @@ c_common_post_options (const char **pfilename)
support. */
if (c_dialect_cxx ())
{
@@ -302,19 +278,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
target_type_format_char='%'
c_target_objs="arm-c.o"
cxx_target_objs="arm-c.o"
-@@ -946,6 +946,11 @@ aarch64*-*-elf | aarch64*-*-rtems*)
- done
- TM_MULTILIB_CONFIG=`echo $TM_MULTILIB_CONFIG | sed 's/^,//'`
- ;;
-+aarch64*-*-freebsd*)
-+ tm_file="${tm_file} dbxelf.h elfos.h ${fbsd_tm_file}"
-+ tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-freebsd.h"
-+ tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-freebsd"
-+ ;;
- aarch64*-*-linux*)
- tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h glibc-stdint.h"
- tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-linux.h"
-@@ -1495,7 +1500,7 @@ i[34567]86-*-linux* | i[34567]86-*-kfreebsd*-gnu | i[34567]86-*-knetbsd*-gnu | i
+@@ -1500,7 +1500,7 @@ i[34567]86-*-linux* | i[34567]86-*-kfreebsd*-gnu | i[34567]86-*-knetbsd*-gnu | i
extra_options="${extra_options} linux-android.opt"
# Assume modern glibc if not targeting Android nor uclibc.
case ${target} in
@@ -323,7 +287,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
;;
*)
default_gnu_indirect_function=yes
-@@ -1564,7 +1569,7 @@ x86_64-*-linux* | x86_64-*-kfreebsd*-gnu | x86_64-*-knetbsd*-gnu)
+@@ -1569,7 +1569,7 @@ x86_64-*-linux* | x86_64-*-kfreebsd*-gnu | x86_64-*-knetbsd*-gnu)
extra_options="${extra_options} linux-android.opt"
# Assume modern glibc if not targeting Android nor uclibc.
case ${target} in
@@ -332,7 +296,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
;;
*)
default_gnu_indirect_function=yes
-@@ -3806,38 +3811,51 @@ case "${target}" in
+@@ -3811,38 +3811,51 @@ case "${target}" in
# Add extra multilibs
if test "x$with_multilib_list" != x; then
arm_multilibs=`echo $with_multilib_list | sed -e 's/,/ /g'`
@@ -410,17 +374,6 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
fi
;;
---- a/src/gcc/config.host
-+++ b/src/gcc/config.host
-@@ -99,7 +99,7 @@ case ${host} in
- esac
-
- case ${host} in
-- aarch64*-*-linux*)
-+ aarch64*-*-freebsd* | aarch64*-*-linux*)
- case ${target} in
- aarch64*-*-*)
- host_extra_gcc_objs="driver-aarch64.o"
--- a/src/gcc/config/aarch64/aarch64-arches.def
+++ b/src/gcc/config/aarch64/aarch64-arches.def
@@ -32,4 +32,5 @@
@@ -720,103 +673,6 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
#define TEXT_SECTION_ASM_OP "\t.text"
#define DATA_SECTION_ASM_OP "\t.data"
#define BSS_SECTION_ASM_OP "\t.bss"
---- /dev/null
-+++ b/src/gcc/config/aarch64/aarch64-freebsd.h
-@@ -0,0 +1,94 @@
-+/* Definitions for AArch64 running FreeBSD
-+ Copyright (C) 2016 Free Software Foundation, Inc.
-+
-+ This file is part of GCC.
-+
-+ GCC is free software; you can redistribute it and/or modify it
-+ under the terms of the GNU General Public License as published by
-+ the Free Software Foundation; either version 3, or (at your option)
-+ any later version.
-+
-+ GCC is distributed in the hope that it will be useful, but
-+ WITHOUT ANY WARRANTY; without even the implied warranty of
-+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ General Public License for more details.
-+
-+ You should have received a copy of the GNU General Public License
-+ along with GCC; see the file COPYING3. If not see
-+ <http://www.gnu.org/licenses/>. */
-+
-+#ifndef GCC_AARCH64_FREEBSD_H
-+#define GCC_AARCH64_FREEBSD_H
-+
-+#undef SUBTARGET_CPP_SPEC
-+#define SUBTARGET_CPP_SPEC FBSD_CPP_SPEC
-+
-+#if TARGET_BIG_ENDIAN_DEFAULT
-+#define TARGET_LINKER_EMULATION "aarch64fbsdb"
-+#else
-+#define TARGET_LINKER_EMULATION "aarch64fbsd"
-+#endif
-+
-+#undef SUBTARGET_EXTRA_LINK_SPEC
-+#define SUBTARGET_EXTRA_LINK_SPEC " -m" TARGET_LINKER_EMULATION
-+
-+#undef FBSD_TARGET_LINK_SPEC
-+#define FBSD_TARGET_LINK_SPEC " \
-+ %{p:%nconsider using `-pg' instead of `-p' with gprof (1) } \
-+ %{v:-V} \
-+ %{assert*} %{R*} %{rpath*} %{defsym*} \
-+ %{shared:-Bshareable %{h*} %{soname*}} \
-+ %{symbolic:-Bsymbolic} \
-+ %{static:-Bstatic} \
-+ %{!static: \
-+ %{rdynamic:-export-dynamic} \
-+ %{!shared:-dynamic-linker " FBSD_DYNAMIC_LINKER " }} \
-+ -X" SUBTARGET_EXTRA_LINK_SPEC " \
-+ %{mbig-endian:-EB} %{mlittle-endian:-EL}"
-+
-+#if TARGET_FIX_ERR_A53_835769_DEFAULT
-+#define CA53_ERR_835769_SPEC \
-+ " %{!mno-fix-cortex-a53-835769:--fix-cortex-a53-835769}"
-+#else
-+#define CA53_ERR_835769_SPEC \
-+ " %{mfix-cortex-a53-835769:--fix-cortex-a53-835769}"
-+#endif
-+
-+#ifdef TARGET_FIX_ERR_A53_843419_DEFAULT
-+#define CA53_ERR_843419_SPEC \
-+ " %{!mno-fix-cortex-a53-843419:--fix-cortex-a53-843419}"
-+#else
-+#define CA53_ERR_843419_SPEC \
-+ " %{mfix-cortex-a53-843419:--fix-cortex-a53-843419}"
-+#endif
-+
-+#undef LINK_SPEC
-+#define LINK_SPEC FBSD_TARGET_LINK_SPEC \
-+ CA53_ERR_835769_SPEC \
-+ CA53_ERR_843419_SPEC
-+
-+#define GNU_USER_TARGET_MATHFILE_SPEC \
-+ "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}"
-+
-+#undef ENDFILE_SPEC
-+#define ENDFILE_SPEC \
-+ GNU_USER_TARGET_MATHFILE_SPEC " " \
-+ FBSD_ENDFILE_SPEC
-+
-+#undef TARGET_OS_CPP_BUILTINS
-+#define TARGET_OS_CPP_BUILTINS() \
-+ do \
-+ { \
-+ FBSD_TARGET_OS_CPP_BUILTINS (); \
-+ } \
-+ while (false)
-+
-+#define TARGET_ASM_FILE_END file_end_indicate_exec_stack
-+
-+/* Uninitialized common symbols in non-PIE executables, even with
-+ strong definitions in dependent shared libraries, will resolve
-+ to COPY relocated symbol in the executable. See PR65780. */
-+#undef TARGET_BINDS_LOCAL_P
-+#define TARGET_BINDS_LOCAL_P default_binds_local_p_2
-+
-+#endif /* GCC_AARCH64_FREEBSD_H */
--- a/src/gcc/config/aarch64/aarch64-modes.def
+++ b/src/gcc/config/aarch64/aarch64-modes.def
@@ -21,8 +21,6 @@
@@ -918,7 +774,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
-int aarch64_simd_attr_length_move (rtx_insn *);
int aarch64_uxt_size (int, HOST_WIDE_INT);
int aarch64_vec_fpconst_pow_of_2 (rtx);
- rtx aarch64_final_eh_return_addr (void);
+ rtx aarch64_eh_return_handler_rtx (void);
-rtx aarch64_legitimize_reload_address (rtx *, machine_mode, int, int, int);
rtx aarch64_mask_from_zextract_ops (rtx, rtx);
const char *aarch64_output_move_struct (rtx *operands);
@@ -939,6 +795,14 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Initialize builtins for SIMD intrinsics. */
void init_aarch64_simd_builtins (void);
+@@ -436,7 +453,6 @@ int aarch64_ccmp_mode_to_code (enum machine_mode mode);
+ bool extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset);
+ bool aarch64_operands_ok_for_ldpstp (rtx *, bool, enum machine_mode);
+ bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, enum machine_mode);
+-extern bool aarch64_nopcrelative_literal_loads;
+
+ extern void aarch64_asm_output_pool_epilogue (FILE *, const char *,
+ tree, HOST_WIDE_INT);
--- a/src/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/src/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -40,9 +40,10 @@
@@ -2633,7 +2497,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Generic costs for vector insn classes. */
static const struct cpu_vector_cost generic_vector_cost =
{
-@@ -326,6 +377,24 @@ static const struct cpu_vector_cost generic_vector_cost =
+@@ -326,18 +377,36 @@ static const struct cpu_vector_cost generic_vector_cost =
1 /* cond_not_taken_branch_cost */
};
@@ -2658,6 +2522,21 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Generic costs for vector insn classes. */
static const struct cpu_vector_cost cortexa57_vector_cost =
{
+ 1, /* scalar_stmt_cost */
+ 4, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+- 3, /* vec_stmt_cost */
++ 2, /* vec_stmt_cost */
+ 3, /* vec_permute_cost */
+ 8, /* vec_to_scalar_cost */
+ 8, /* scalar_to_vec_cost */
+- 5, /* vec_align_load_cost */
+- 5, /* vec_unalign_load_cost */
++ 4, /* vec_align_load_cost */
++ 4, /* vec_unalign_load_cost */
+ 1, /* vec_unalign_store_cost */
+ 1, /* vec_store_cost */
+ 1, /* cond_taken_branch_cost */
@@ -379,6 +448,24 @@ static const struct cpu_vector_cost xgene1_vector_cost =
1 /* cond_not_taken_branch_cost */
};
@@ -3057,7 +2936,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
static bool
aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
tree exp ATTRIBUTE_UNUSED)
-@@ -2490,7 +2765,7 @@ static void
+@@ -2494,7 +2769,7 @@ static void
aarch64_layout_frame (void)
{
HOST_WIDE_INT offset = 0;
@@ -3066,7 +2945,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if (reload_completed && cfun->machine->frame.laid_out)
return;
-@@ -2498,8 +2773,8 @@ aarch64_layout_frame (void)
+@@ -2502,8 +2777,8 @@ aarch64_layout_frame (void)
#define SLOT_NOT_REQUIRED (-2)
#define SLOT_REQUIRED (-1)
@@ -3077,7 +2956,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* First mark all the registers that really need to be saved... */
for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
-@@ -2524,7 +2799,10 @@ aarch64_layout_frame (void)
+@@ -2528,7 +2803,10 @@ aarch64_layout_frame (void)
for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
if (df_regs_ever_live_p (regno)
&& !call_used_regs[regno])
@@ -3089,7 +2968,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if (frame_pointer_needed)
{
-@@ -2533,7 +2811,6 @@ aarch64_layout_frame (void)
+@@ -2537,7 +2815,6 @@ aarch64_layout_frame (void)
cfun->machine->frame.wb_candidate1 = R29_REGNUM;
cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
cfun->machine->frame.wb_candidate2 = R30_REGNUM;
@@ -3097,7 +2976,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
offset += 2 * UNITS_PER_WORD;
}
-@@ -2542,35 +2819,46 @@ aarch64_layout_frame (void)
+@@ -2546,35 +2823,46 @@ aarch64_layout_frame (void)
if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
{
cfun->machine->frame.reg_offset[regno] = offset;
@@ -3153,7 +3032,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
STACK_BOUNDARY / BITS_PER_UNIT);
cfun->machine->frame.frame_size
-@@ -2578,15 +2866,92 @@ aarch64_layout_frame (void)
+@@ -2582,15 +2870,92 @@ aarch64_layout_frame (void)
+ crtl->outgoing_args_size,
STACK_BOUNDARY / BITS_PER_UNIT);
@@ -3246,7 +3125,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
static unsigned
aarch64_next_callee_save (unsigned regno, unsigned limit)
{
-@@ -2595,6 +2960,9 @@ aarch64_next_callee_save (unsigned regno, unsigned limit)
+@@ -2599,6 +2964,9 @@ aarch64_next_callee_save (unsigned regno, unsigned limit)
return regno;
}
@@ -3256,7 +3135,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
static void
aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
HOST_WIDE_INT adjustment)
-@@ -2611,6 +2979,10 @@ aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
+@@ -2615,6 +2983,10 @@ aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
RTX_FRAME_RELATED_P (insn) = 1;
}
@@ -3267,7 +3146,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
static rtx
aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
HOST_WIDE_INT adjustment)
-@@ -2630,11 +3002,18 @@ aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
+@@ -2634,11 +3006,18 @@ aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
}
}
@@ -3288,7 +3167,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
rtx reg1 = gen_rtx_REG (mode, regno1);
rtx reg2 = gen_rtx_REG (mode, regno2);
-@@ -2645,6 +3024,9 @@ aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
+@@ -2649,6 +3028,9 @@ aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
RTX_FRAME_RELATED_P (insn) = 1;
}
@@ -3298,7 +3177,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
static rtx
aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
HOST_WIDE_INT adjustment)
-@@ -2662,6 +3044,37 @@ aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
+@@ -2666,6 +3048,37 @@ aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
}
}
@@ -3336,7 +3215,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
static rtx
aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
rtx reg2)
-@@ -2679,6 +3092,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
+@@ -2683,6 +3096,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
}
}
@@ -3346,7 +3225,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
static rtx
aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
rtx mem2)
-@@ -2696,6 +3112,9 @@ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
+@@ -2700,6 +3116,9 @@ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
}
}
@@ -3356,7 +3235,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
static void
aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
-@@ -2754,6 +3173,11 @@ aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
+@@ -2758,6 +3177,11 @@ aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
}
}
@@ -3368,7 +3247,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
static void
aarch64_restore_callee_saves (machine_mode mode,
HOST_WIDE_INT start_offset, unsigned start,
-@@ -2848,23 +3272,16 @@ aarch64_restore_callee_saves (machine_mode mode,
+@@ -2852,23 +3276,16 @@ aarch64_restore_callee_saves (machine_mode mode,
void
aarch64_expand_prologue (void)
{
@@ -3400,7 +3279,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if (flag_stack_usage_info)
current_function_static_stack_size = frame_size;
-@@ -2881,129 +3298,28 @@ aarch64_expand_prologue (void)
+@@ -2885,129 +3302,28 @@ aarch64_expand_prologue (void)
aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
}
@@ -3458,8 +3337,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
- frame_size = -1;
-
- if (offset > 0)
-+ if (frame_pointer_needed)
- {
+- {
- bool skip_wb = false;
-
- if (frame_pointer_needed)
@@ -3517,16 +3395,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
- skip_wb);
- aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
- skip_wb);
-+ if (callee_adjust == 0)
-+ aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
-+ R30_REGNUM, false);
-+ insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
-+ stack_pointer_rtx,
-+ GEN_INT (callee_offset)));
-+ RTX_FRAME_RELATED_P (insn) = 1;
-+ emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
- }
-
+- }
+-
- /* when offset >= 512,
- sub sp, sp, #<outgoing_args_size> */
- if (frame_size > -1)
@@ -3538,7 +3408,18 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
- GEN_INT (- crtl->outgoing_args_size)));
- RTX_FRAME_RELATED_P (insn) = 1;
- }
-- }
++ if (frame_pointer_needed)
++ {
++ if (callee_adjust == 0)
++ aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
++ R30_REGNUM, false);
++ insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
++ stack_pointer_rtx,
++ GEN_INT (callee_offset)));
++ RTX_FRAME_RELATED_P (insn) = 1;
++ emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
+ }
++
+ aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
+ callee_adjust != 0 || frame_pointer_needed);
+ aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
@@ -3547,7 +3428,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
}
/* Return TRUE if we can use a simple_return insn.
-@@ -3026,150 +3342,79 @@ aarch64_use_return_insn_p (void)
+@@ -3030,151 +3346,80 @@ aarch64_use_return_insn_p (void)
return cfun->machine->frame.frame_size == 0;
}
@@ -3566,7 +3447,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
- rtx_insn *insn;
- /* We need to add memory barrier to prevent read from deallocated stack. */
- bool need_barrier_p = (get_frame_size () != 0
-- || cfun->machine->frame.saved_varargs_size);
+- || cfun->machine->frame.saved_varargs_size
+- || crtl->calls_eh_return);
-
aarch64_layout_frame ();
@@ -3602,7 +3484,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
- RTX_FRAME_RELATED_P (insn) = 1;
- }
+ /* Emit a barrier to prevent loads from a deallocated stack. */
-+ if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca)
++ if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
++ || crtl->calls_eh_return)
+ {
+ emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
+ need_barrier_p = false;
@@ -3750,8 +3633,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
RTX_FRAME_RELATED_P (insn) = 1;
}
-@@ -3237,122 +3482,6 @@ aarch64_final_eh_return_addr (void)
- - 2 * UNITS_PER_WORD));
+@@ -3230,122 +3475,6 @@ aarch64_eh_return_handler_rtx (void)
+ return tmp;
}
-/* Possibly output code to build up a constant in a register. For
@@ -3873,7 +3756,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Output code to add DELTA to the first argument, and then jump
to FUNCTION. Used for C++ multiple inheritance. */
static void
-@@ -3373,7 +3502,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
+@@ -3366,7 +3495,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
emit_note (NOTE_INSN_PROLOGUE_END);
if (vcall_offset == 0)
@@ -3882,7 +3765,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
else
{
gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
-@@ -3389,7 +3518,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
+@@ -3382,7 +3511,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
plus_constant (Pmode, this_rtx, delta));
else
@@ -3891,7 +3774,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
}
if (Pmode == ptr_mode)
-@@ -3403,7 +3532,8 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
+@@ -3396,7 +3525,8 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
addr = plus_constant (Pmode, temp0, vcall_offset);
else
{
@@ -3901,7 +3784,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
addr = gen_rtx_PLUS (Pmode, temp0, temp1);
}
-@@ -3582,7 +3712,12 @@ aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
+@@ -3575,7 +3705,12 @@ aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
return aarch64_tls_referenced_p (x);
}
@@ -3915,7 +3798,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
static unsigned int
aarch64_case_values_threshold (void)
-@@ -3593,7 +3728,7 @@ aarch64_case_values_threshold (void)
+@@ -3586,7 +3721,7 @@ aarch64_case_values_threshold (void)
&& selected_cpu->tune->max_case_values != 0)
return selected_cpu->tune->max_case_values;
else
@@ -3924,7 +3807,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
}
/* Return true if register REGNO is a valid index register.
-@@ -3928,9 +4063,11 @@ aarch64_classify_address (struct aarch64_address_info *info,
+@@ -3921,9 +4056,11 @@ aarch64_classify_address (struct aarch64_address_info *info,
X,X: 7-bit signed scaled offset
Q: 9-bit signed offset
We conservatively require an offset representable in either mode.
@@ -3938,7 +3821,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
&& offset_9bit_signed_unscaled_p (mode, offset));
/* A 7bit offset check because OImode will emit a ldp/stp
-@@ -4038,7 +4175,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
+@@ -4031,7 +4168,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
return ((GET_CODE (sym) == LABEL_REF
|| (GET_CODE (sym) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (sym)
@@ -3947,7 +3830,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
}
return false;
-@@ -4132,6 +4269,24 @@ aarch64_legitimate_address_p (machine_mode mode, rtx x,
+@@ -4125,6 +4262,24 @@ aarch64_legitimate_address_p (machine_mode mode, rtx x,
return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
}
@@ -3972,7 +3855,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Return TRUE if rtx X is immediate constant 0.0 */
bool
aarch64_float_const_zero_rtx_p (rtx x)
-@@ -4205,6 +4360,14 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
+@@ -4198,6 +4353,14 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
&& (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
return CC_NZmode;
@@ -3987,7 +3870,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
&& y == const0_rtx
&& (code == EQ || code == NE || code == LT || code == GE)
-@@ -4232,14 +4395,6 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
+@@ -4225,14 +4388,6 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
&& GET_CODE (x) == NEG)
return CC_Zmode;
@@ -4002,7 +3885,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* A test for unsigned overflow. */
if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
&& code == NE
-@@ -4308,8 +4463,6 @@ aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
+@@ -4301,8 +4456,6 @@ aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
break;
case CC_SWPmode:
@@ -4011,7 +3894,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
switch (comp_code)
{
case NE: return AARCH64_NE;
-@@ -4964,7 +5117,7 @@ aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
+@@ -4957,7 +5110,7 @@ aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
{
rtx base = XEXP (x, 0);
@@ -4020,7 +3903,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
HOST_WIDE_INT offset = INTVAL (offset_rtx);
if (GET_CODE (base) == PLUS)
-@@ -5022,120 +5175,6 @@ aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
+@@ -5015,120 +5168,6 @@ aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
return x;
}
@@ -4141,7 +4024,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Return the reload icode required for a constant pool in mode. */
static enum insn_code
aarch64_constant_pool_reload_icode (machine_mode mode)
-@@ -5193,7 +5232,7 @@ aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
+@@ -5186,7 +5225,7 @@ aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
&& (SCALAR_FLOAT_MODE_P (GET_MODE (x))
|| targetm.vector_mode_supported_p (GET_MODE (x)))
@@ -4150,7 +4033,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
{
sri->icode = aarch64_constant_pool_reload_icode (mode);
return NO_REGS;
-@@ -5267,18 +5306,18 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to)
+@@ -5260,18 +5299,18 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to)
if (to == HARD_FRAME_POINTER_REGNUM)
{
if (from == ARG_POINTER_REGNUM)
@@ -4174,7 +4057,31 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
}
return cfun->machine->frame.frame_size;
-@@ -5527,7 +5566,7 @@ aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
+@@ -5418,7 +5457,10 @@ aarch64_elf_asm_constructor (rtx symbol, int priority)
+ else
+ {
+ section *s;
+- char buf[18];
++ /* While priority is known to be in range [0, 65535], so 18 bytes
++ would be enough, the compiler might not know that. To avoid
++ -Wformat-truncation false positive, use a larger size. */
++ char buf[23];
+ snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
+ s = get_section (buf, SECTION_WRITE, NULL);
+ switch_to_section (s);
+@@ -5435,7 +5477,10 @@ aarch64_elf_asm_destructor (rtx symbol, int priority)
+ else
+ {
+ section *s;
+- char buf[18];
++ /* While priority is known to be in range [0, 65535], so 18 bytes
++ would be enough, the compiler might not know that. To avoid
++ -Wformat-truncation false positive, use a larger size. */
++ char buf[23];
+ snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
+ s = get_section (buf, SECTION_WRITE, NULL);
+ switch_to_section (s);
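
For context on the two hunks above: ".init_array." is 12 characters and "%.5u" prints exactly 5 digits for any priority in [0, 65535], so 12 + 5 + 1 (NUL) = 18 bytes do suffice at run time; -Wformat-truncation, however, must assume an arbitrary unsigned int of up to 10 digits, i.e. 12 + 10 + 1 = 23 bytes. A minimal standalone C sketch, not part of the patch and with illustrative variable names, reproduces the warning:

#include <stdio.h>

int main (void)
{
  unsigned int priority = 65535; /* ELF constructor priorities never exceed this */
  char small[18];                /* enough in practice: 12 + 5 digits + NUL */
  char safe[23];                 /* provably enough: 12 + 10 digits + NUL */

  /* Compiled with -Wformat-truncation (GCC 7 or later), the compiler warns
     about 'small' because it cannot see the [0, 65535] range of 'priority',
     while 'safe' is accepted.  */
  snprintf (small, sizeof small, ".init_array.%.5u", priority);
  snprintf (safe, sizeof safe, ".init_array.%.5u", priority);
  printf ("%s\n%s\n", small, safe);
  return 0;
}
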
+@@ -5520,7 +5565,7 @@ aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
static inline bool
aarch64_can_use_per_function_literal_pools_p (void)
{
@@ -4183,7 +4090,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
|| aarch64_cmodel == AARCH64_CMODEL_LARGE);
}
-@@ -6146,6 +6185,19 @@ aarch64_extend_bitfield_pattern_p (rtx x)
+@@ -6139,6 +6184,19 @@ aarch64_extend_bitfield_pattern_p (rtx x)
return op;
}
@@ -4203,7 +4110,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Calculate the cost of calculating X, storing it in *COST. Result
is true if the total cost of the operation has now been calculated. */
static bool
-@@ -6411,10 +6463,6 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
+@@ -6404,10 +6462,6 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
/* TODO: A write to the CC flags possibly costs extra, this
needs encoding in the cost tables. */
@@ -4214,7 +4121,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
mode = GET_MODE (op0);
/* ANDS. */
if (GET_CODE (op0) == AND)
-@@ -6724,17 +6772,31 @@ cost_plus:
+@@ -6717,17 +6771,31 @@ cost_plus:
if (GET_MODE_CLASS (mode) == MODE_INT)
{
@@ -4254,7 +4161,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
}
else
{
-@@ -6838,11 +6900,12 @@ cost_plus:
+@@ -6831,11 +6899,12 @@ cost_plus:
{
int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
@@ -4272,7 +4179,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
*cost = op_cost;
return true;
-@@ -6872,8 +6935,8 @@ cost_plus:
+@@ -6865,8 +6934,8 @@ cost_plus:
}
else
{
@@ -4283,7 +4190,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
}
}
return false;
-@@ -7452,12 +7515,12 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
+@@ -7445,12 +7514,12 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
to optimize 1.0/sqrt. */
static bool
@@ -4299,7 +4206,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
|| flag_mrecip_low_precision_sqrt));
}
-@@ -7467,89 +7530,225 @@ use_rsqrt_p (void)
+@@ -7460,89 +7529,225 @@ use_rsqrt_p (void)
static tree
aarch64_builtin_reciprocal (tree fndecl)
{
@@ -4381,13 +4288,13 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
- mode == SFmode || mode == V2SFmode || mode == V4SFmode
- || mode == DFmode || mode == V2DFmode);
+ machine_mode mode = GET_MODE (dst);
-+
-+ if (GET_MODE_INNER (mode) == HFmode)
-+ return false;
- rtx xsrc = gen_reg_rtx (mode);
- emit_move_insn (xsrc, src);
- rtx x0 = gen_reg_rtx (mode);
++ if (GET_MODE_INNER (mode) == HFmode)
++ return false;
++
+ machine_mode mmsk = mode_for_vector
+ (int_mode_for_mode (GET_MODE_INNER (mode)),
+ GET_MODE_NUNITS (mode));
@@ -4486,12 +4393,9 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ default: gcc_unreachable ();
+ }
+}
-
-- emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
++
+typedef rtx (*recps_type) (rtx, rtx, rtx);
-
-- emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
-- x0 = x1;
++
+/* Select reciprocal series step insn depending on machine mode. */
+
+static recps_type
@@ -4533,10 +4437,13 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ /* Estimate the approximate reciprocal. */
+ rtx xrcp = gen_reg_rtx (mode);
+ emit_insn ((*get_recpe_type (mode)) (xrcp, den));
-+
+
+- emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
+ /* Iterate over the series twice for SF and thrice for DF. */
+ int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
-+
+
+- emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
+- x0 = x1;
+ /* Optionally iterate over the series once less for faster performance,
+ while sacrificing the accuracy. */
+ if (flag_mlow_precision_div)
@@ -4550,24 +4457,24 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+
+ if (iterations > 0)
+ emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
-+ }
-+
+ }
+
+- emit_move_insn (dst, x0);
+ if (num != CONST1_RTX (mode))
+ {
+ /* As the approximate reciprocal of DEN is already calculated, only
+ calculate the approximate division when NUM is not 1.0. */
+ rtx xnum = force_reg (mode, num);
+ emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
- }
-
-- emit_move_insn (dst, x0);
++ }
++
+ /* Finalize the approximation. */
+ emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
+ return true;
}
/* Return the number of instructions that can be issued per cycle. */
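
The recpe/recps series added above is plain Newton-Raphson iteration on x ~ 1/d: the initial reciprocal estimate is refined by steps that compute 2 - d * x, and since each step roughly squares the relative error, two iterations reach SFmode precision and three reach DFmode precision, as the comments in the hunk note. A scalar C sketch of the recurrence, illustrative only and not part of the patch:

#include <stdio.h>

/* One reciprocal series step, the scalar analogue of what the patch
   emits via the recps insns.  */
static double recps (double d, double x) { return 2.0 - d * x; }

int main (void)
{
  double d = 3.0;
  double x = 0.3;             /* stand-in for the hardware's rough estimate */
  for (int i = 0; i < 3; i++) /* three iterations, as used for DFmode */
    x = x * recps (d, x);     /* relative error roughly squares each step */
  printf ("1/%g ~ %.17g\n", d, x);
  return 0;
}
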
-@@ -8053,32 +8252,37 @@ aarch64_override_options_after_change_1 (struct gcc_options *opts)
+@@ -8046,32 +8251,37 @@ aarch64_override_options_after_change_1 (struct gcc_options *opts)
opts->x_align_functions = aarch64_tune_params.function_align;
}
@@ -4622,7 +4529,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
}
/* 'Unpack' up the internal tuning structs and update the options
-@@ -9286,15 +9490,18 @@ aarch64_classify_symbol (rtx x, rtx offset)
+@@ -9279,15 +9489,18 @@ aarch64_classify_symbol (rtx x, rtx offset)
switch (aarch64_cmodel)
{
case AARCH64_CMODEL_TINY:
@@ -4644,7 +4551,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
|| INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
return SYMBOL_FORCE_TO_MEM;
return SYMBOL_TINY_ABSOLUTE;
-@@ -9302,7 +9509,8 @@ aarch64_classify_symbol (rtx x, rtx offset)
+@@ -9295,7 +9508,8 @@ aarch64_classify_symbol (rtx x, rtx offset)
case AARCH64_CMODEL_SMALL:
/* Same reasoning as the tiny code model, but the offset cap here is
4G. */
@@ -4654,7 +4561,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
|| !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
HOST_WIDE_INT_C (4294967264)))
return SYMBOL_FORCE_TO_MEM;
-@@ -9324,8 +9532,7 @@ aarch64_classify_symbol (rtx x, rtx offset)
+@@ -9317,8 +9531,7 @@ aarch64_classify_symbol (rtx x, rtx offset)
/* This is alright even in PIC code as the constant
pool reference is always PC relative and within
the same translation unit. */
@@ -4664,7 +4571,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
return SYMBOL_SMALL_ABSOLUTE;
else
return SYMBOL_FORCE_TO_MEM;
-@@ -9461,6 +9668,13 @@ aarch64_build_builtin_va_list (void)
+@@ -9454,6 +9667,13 @@ aarch64_build_builtin_va_list (void)
FIELD_DECL, get_identifier ("__vr_offs"),
integer_type_node);
@@ -4678,7 +4585,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
DECL_ARTIFICIAL (f_stack) = 1;
DECL_ARTIFICIAL (f_grtop) = 1;
DECL_ARTIFICIAL (f_vrtop) = 1;
-@@ -9493,15 +9707,17 @@ aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
+@@ -9486,15 +9706,17 @@ aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
tree stack, grtop, vrtop, groff, vroff;
tree t;
@@ -4702,7 +4609,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if (!TARGET_FLOAT)
{
-@@ -9830,7 +10046,8 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
+@@ -9823,7 +10045,8 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
{
CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
CUMULATIVE_ARGS local_cum;
@@ -4712,7 +4619,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* The caller has advanced CUM up to, but not beyond, the last named
argument. Advance a local copy of CUM past the last "real" named
-@@ -9838,9 +10055,14 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
+@@ -9831,9 +10054,14 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
local_cum = *cum;
aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
@@ -4730,7 +4637,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if (!TARGET_FLOAT)
{
-@@ -9868,7 +10090,7 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
+@@ -9861,7 +10089,7 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
/* We can't use move_block_from_reg, because it will use
the wrong mode, storing D regs only. */
machine_mode mode = TImode;
@@ -4739,7 +4646,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Set OFF to the offset from virtual_incoming_args_rtx of
the first vector register. The VR save area lies below
-@@ -9877,14 +10099,15 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
+@@ -9870,14 +10098,15 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
STACK_BOUNDARY / BITS_PER_UNIT);
off -= vr_saved * UNITS_PER_VREG;
@@ -4757,7 +4664,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
off += UNITS_PER_VREG;
}
}
-@@ -10846,33 +11069,6 @@ aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
+@@ -10839,33 +11068,6 @@ aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
gen_rtx_REG (mode, rsrc + count - i - 1));
}
@@ -4791,7 +4698,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
one of VSTRUCT modes: OI, CI, or XI. */
int
-@@ -11954,12 +12150,11 @@ aarch64_output_simd_mov_immediate (rtx const_vector,
+@@ -11947,12 +12149,11 @@ aarch64_output_simd_mov_immediate (rtx const_vector,
info.value = GEN_INT (0);
else
{
@@ -4805,7 +4712,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if (lane_count == 1)
snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
-@@ -12193,6 +12388,8 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d)
+@@ -12186,6 +12387,8 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d)
case V4SImode: gen = gen_aarch64_trn2v4si; break;
case V2SImode: gen = gen_aarch64_trn2v2si; break;
case V2DImode: gen = gen_aarch64_trn2v2di; break;
@@ -4814,7 +4721,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
case V2DFmode: gen = gen_aarch64_trn2v2df; break;
-@@ -12211,6 +12408,8 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d)
+@@ -12204,6 +12407,8 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d)
case V4SImode: gen = gen_aarch64_trn1v4si; break;
case V2SImode: gen = gen_aarch64_trn1v2si; break;
case V2DImode: gen = gen_aarch64_trn1v2di; break;
@@ -4823,7 +4730,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
case V2DFmode: gen = gen_aarch64_trn1v2df; break;
-@@ -12276,6 +12475,8 @@ aarch64_evpc_uzp (struct expand_vec_perm_d *d)
+@@ -12269,6 +12474,8 @@ aarch64_evpc_uzp (struct expand_vec_perm_d *d)
case V4SImode: gen = gen_aarch64_uzp2v4si; break;
case V2SImode: gen = gen_aarch64_uzp2v2si; break;
case V2DImode: gen = gen_aarch64_uzp2v2di; break;
@@ -4832,7 +4739,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
-@@ -12294,6 +12495,8 @@ aarch64_evpc_uzp (struct expand_vec_perm_d *d)
+@@ -12287,6 +12494,8 @@ aarch64_evpc_uzp (struct expand_vec_perm_d *d)
case V4SImode: gen = gen_aarch64_uzp1v4si; break;
case V2SImode: gen = gen_aarch64_uzp1v2si; break;
case V2DImode: gen = gen_aarch64_uzp1v2di; break;
@@ -4841,7 +4748,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
-@@ -12364,6 +12567,8 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d)
+@@ -12357,6 +12566,8 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d)
case V4SImode: gen = gen_aarch64_zip2v4si; break;
case V2SImode: gen = gen_aarch64_zip2v2si; break;
case V2DImode: gen = gen_aarch64_zip2v2di; break;
@@ -4850,7 +4757,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
case V2DFmode: gen = gen_aarch64_zip2v2df; break;
-@@ -12382,6 +12587,8 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d)
+@@ -12375,6 +12586,8 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d)
case V4SImode: gen = gen_aarch64_zip1v4si; break;
case V2SImode: gen = gen_aarch64_zip1v2si; break;
case V2DImode: gen = gen_aarch64_zip1v2di; break;
@@ -4859,7 +4766,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
case V2DFmode: gen = gen_aarch64_zip1v2df; break;
-@@ -12426,6 +12633,8 @@ aarch64_evpc_ext (struct expand_vec_perm_d *d)
+@@ -12419,6 +12632,8 @@ aarch64_evpc_ext (struct expand_vec_perm_d *d)
case V8HImode: gen = gen_aarch64_extv8hi; break;
case V2SImode: gen = gen_aarch64_extv2si; break;
case V4SImode: gen = gen_aarch64_extv4si; break;
@@ -4868,7 +4775,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
case V2SFmode: gen = gen_aarch64_extv2sf; break;
case V4SFmode: gen = gen_aarch64_extv4sf; break;
case V2DImode: gen = gen_aarch64_extv2di; break;
-@@ -12501,6 +12710,8 @@ aarch64_evpc_rev (struct expand_vec_perm_d *d)
+@@ -12494,6 +12709,8 @@ aarch64_evpc_rev (struct expand_vec_perm_d *d)
case V2SImode: gen = gen_aarch64_rev64v2si; break;
case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
@@ -4877,7 +4784,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
default:
return false;
}
-@@ -12744,24 +12955,6 @@ aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
+@@ -12737,24 +12954,6 @@ aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
return ret;
}
@@ -4902,7 +4809,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
rtx
aarch64_reverse_mask (enum machine_mode mode)
{
-@@ -12783,7 +12976,14 @@ aarch64_reverse_mask (enum machine_mode mode)
+@@ -12776,7 +12975,14 @@ aarch64_reverse_mask (enum machine_mode mode)
return force_reg (V16QImode, mask);
}
@@ -4918,7 +4825,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
bool
aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
-@@ -12794,9 +12994,12 @@ aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
+@@ -12787,9 +12993,12 @@ aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
/* We specifically want to allow elements of "structure" modes to
be tieable to the structure. This more general condition allows
other rarer situations too. */
@@ -4934,7 +4841,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
return true;
return false;
-@@ -13312,6 +13515,14 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
+@@ -13305,6 +13514,14 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
return false;
}
@@ -4949,7 +4856,34 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* If MEM is in the form of [base+offset], extract the two parts
of address and set to BASE and OFFSET, otherwise return false
after clearing BASE and OFFSET. */
-@@ -13490,6 +13701,15 @@ aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
+@@ -13449,6 +13666,26 @@ aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
+ return;
+ }
+
++/* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
++ Adjust priority of sha1h instructions so they are scheduled before
++ other SHA1 instructions. */
++
++static int
++aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
++{
++ rtx x = PATTERN (insn);
++
++ if (GET_CODE (x) == SET)
++ {
++ x = SET_SRC (x);
++
++ if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
++ return priority + 10;
++ }
++
++ return priority;
++}
++
+ /* Given OPERANDS of consecutive load/store, check if we can merge
+ them into ldp/stp. LOAD is true if they are load instructions.
+ MODE is the mode of memory operands. */
+@@ -13483,6 +13720,15 @@ aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
return false;
@@ -4965,7 +4899,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Check if the addresses are in the form of [base+offset]. */
extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
-@@ -13649,6 +13869,15 @@ aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
+@@ -13642,6 +13888,15 @@ aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
return false;
}
@@ -4981,7 +4915,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
rclass_1 = FP_REGS;
else
-@@ -13884,13 +14113,13 @@ aarch64_promoted_type (const_tree t)
+@@ -13877,13 +14132,13 @@ aarch64_promoted_type (const_tree t)
/* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
static bool
@@ -4997,7 +4931,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
default:
return true;
-@@ -14024,6 +14253,10 @@ aarch64_optab_supported_p (int op, machine_mode, machine_mode,
+@@ -14017,6 +14272,10 @@ aarch64_optab_supported_p (int op, machine_mode, machine_mode,
#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
@@ -5008,7 +4942,17 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
-@@ -14227,6 +14460,9 @@ aarch64_optab_supported_p (int op, machine_mode, machine_mode,
+@@ -14196,6 +14455,9 @@ aarch64_optab_supported_p (int op, machine_mode, machine_mode,
+ #undef TARGET_CAN_USE_DOLOOP_P
+ #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
+
++#undef TARGET_SCHED_ADJUST_PRIORITY
++#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
++
+ #undef TARGET_SCHED_MACRO_FUSION_P
+ #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
+
+@@ -14220,6 +14482,9 @@ aarch64_optab_supported_p (int op, machine_mode, machine_mode,
#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
@@ -5195,7 +5139,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
UNSPEC_USHL_2S
UNSPEC_VSTRUCTDUMMY
UNSPEC_SP_SET
-@@ -856,13 +860,6 @@
+@@ -837,13 +841,6 @@
|| aarch64_is_noplt_call_p (callee)))
XEXP (operands[0], 0) = force_reg (Pmode, callee);
@@ -5209,7 +5153,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if (operands[2] == NULL_RTX)
operands[2] = const0_rtx;
-@@ -894,14 +891,6 @@
+@@ -875,14 +872,6 @@
|| aarch64_is_noplt_call_p (callee)))
XEXP (operands[1], 0) = force_reg (Pmode, callee);
@@ -5224,7 +5168,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if (operands[3] == NULL_RTX)
operands[3] = const0_rtx;
-@@ -1179,11 +1168,12 @@
+@@ -1160,11 +1149,12 @@
)
(define_insn "*movhf_aarch64"
@@ -5239,7 +5183,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
mov\\t%0.h[0], %w1
umov\\t%w0, %1.h[0]
mov\\t%0.h[0], %1.h[0]
-@@ -1192,18 +1182,18 @@
+@@ -1173,18 +1163,18 @@
ldrh\\t%w0, %1
strh\\t%w1, %0
mov\\t%w0, %w1"
@@ -5263,7 +5207,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
fmov\\t%s0, %w1
fmov\\t%w0, %s1
fmov\\t%s0, %s1
-@@ -1213,16 +1203,18 @@
+@@ -1194,16 +1184,18 @@
ldr\\t%w0, %1
str\\t%w1, %0
mov\\t%w0, %w1"
@@ -5286,7 +5230,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
fmov\\t%d0, %x1
fmov\\t%x0, %d1
fmov\\t%d0, %d1
-@@ -1232,8 +1224,9 @@
+@@ -1213,8 +1205,9 @@
ldr\\t%x0, %1
str\\t%x1, %0
mov\\t%x0, %x1"
@@ -5298,7 +5242,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
)
(define_insn "*movtf_aarch64"
-@@ -1258,7 +1251,6 @@
+@@ -1239,7 +1232,6 @@
[(set_attr "type" "logic_reg,multiple,f_mcr,f_mrc,neon_move_q,f_mcr,\
f_loadd,f_stored,load2,store2,store2")
(set_attr "length" "4,8,8,8,4,4,4,4,4,4,4")
@@ -5306,7 +5250,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(set_attr "simd" "yes,*,*,*,yes,*,*,*,*,*,*")]
)
-@@ -1571,10 +1563,10 @@
+@@ -1552,10 +1544,10 @@
(zero_extend:GPI (match_operand:SHORT 1 "nonimmediate_operand" "r,m,m")))]
""
"@
@@ -5319,7 +5263,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
)
(define_expand "<optab>qihi2"
-@@ -1583,16 +1575,26 @@
+@@ -1564,16 +1556,26 @@
""
)
@@ -5350,7 +5294,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
;; -------------------------------------------------------------------
;; Simple arithmetic
;; -------------------------------------------------------------------
-@@ -1604,25 +1606,12 @@
+@@ -1585,25 +1587,16 @@
(match_operand:GPI 2 "aarch64_pluslong_operand" "")))]
""
{
@@ -5373,16 +5317,20 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
- operands[2] = GEN_INT (s);
- }
- }
++ /* If operands[1] is a subreg, extract the inner RTX. */
++ rtx op1 = REG_P (operands[1]) ? operands[1] : SUBREG_REG (operands[1]);
++
+ /* If the constant is too large for a single instruction and isn't frame
+ based, split off the immediate so it is available for CSE. */
+ if (!aarch64_plus_immediate (operands[2], <MODE>mode)
+ && can_create_pseudo_p ()
-+ && !REGNO_PTR_FRAME_P (REGNO (operands[1])))
++ && (!REG_P (op1)
++ || !REGNO_PTR_FRAME_P (REGNO (op1))))
+ operands[2] = force_reg (<MODE>mode, operands[2]);
})
(define_insn "*add<mode>3_aarch64"
-@@ -1784,7 +1773,7 @@
+@@ -1765,7 +1758,7 @@
"aarch64_zero_extend_const_eq (<DWI>mode, operands[2],
<MODE>mode, operands[1])"
"@
@@ -5391,7 +5339,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
cmp\\t%<w>0, #%n1"
[(set_attr "type" "alus_imm")]
)
-@@ -1816,11 +1805,11 @@
+@@ -1797,11 +1790,11 @@
"aarch64_zero_extend_const_eq (<DWI>mode, operands[3],
<MODE>mode, operands[2])"
"@
@@ -5405,7 +5353,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(define_insn "add<mode>3_compareC"
[(set (reg:CC_C CC_REGNUM)
(ne:CC_C
-@@ -3423,7 +3412,9 @@
+@@ -3404,7 +3397,9 @@
(LOGICAL:SI (match_operand:SI 1 "register_operand" "%r,r")
(match_operand:SI 2 "aarch64_logical_operand" "r,K"))))]
""
@@ -5416,7 +5364,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
[(set_attr "type" "logic_reg,logic_imm")]
)
-@@ -3436,7 +3427,9 @@
+@@ -3417,7 +3412,9 @@
(set (match_operand:GPI 0 "register_operand" "=r,r")
(and:GPI (match_dup 1) (match_dup 2)))]
""
@@ -5427,7 +5375,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
[(set_attr "type" "logics_reg,logics_imm")]
)
-@@ -3450,7 +3443,9 @@
+@@ -3431,7 +3428,9 @@
(set (match_operand:DI 0 "register_operand" "=r,r")
(zero_extend:DI (and:SI (match_dup 1) (match_dup 2))))]
""
@@ -5438,7 +5386,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
[(set_attr "type" "logics_reg,logics_imm")]
)
-@@ -3776,16 +3771,23 @@
+@@ -3757,16 +3756,23 @@
[(set_attr "type" "rbit")]
)
@@ -5471,7 +5419,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(define_insn "*and<mode>_compare0"
[(set (reg:CC_NZ CC_REGNUM)
-@@ -3797,6 +3799,18 @@
+@@ -3778,6 +3784,18 @@
[(set_attr "type" "alus_imm")]
)
@@ -5490,7 +5438,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(define_insn "*and<mode>3nr_compare0"
[(set (reg:CC_NZ CC_REGNUM)
(compare:CC_NZ
-@@ -3804,7 +3818,9 @@
+@@ -3785,7 +3803,9 @@
(match_operand:GPI 1 "aarch64_logical_operand" "r,<lconst>"))
(const_int 0)))]
""
@@ -5501,7 +5449,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
[(set_attr "type" "logics_reg,logics_imm")]
)
-@@ -3870,22 +3886,16 @@
+@@ -3851,22 +3871,16 @@
(define_expand "ashl<mode>3"
[(set (match_operand:SHORT 0 "register_operand")
(ashift:SHORT (match_operand:SHORT 1 "register_operand")
@@ -5530,7 +5478,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
}
)
-@@ -3934,33 +3944,35 @@
+@@ -3915,33 +3929,35 @@
;; Logical left shift using SISD or Integer instruction
(define_insn "*aarch64_ashl_sisd_or_int_<mode>3"
@@ -5551,7 +5499,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
- [(set_attr "simd" "no,yes,yes")
- (set_attr "type" "shift_reg,neon_shift_imm<q>, neon_shift_reg<q>")]
+ [(set_attr "simd" "no,no,yes,yes")
-+ (set_attr "type" "bfm,shift_reg,neon_shift_imm<q>, neon_shift_reg<q>")]
++ (set_attr "type" "bfx,shift_reg,neon_shift_imm<q>, neon_shift_reg<q>")]
)
;; Logical right shift using SISD or Integer instruction
@@ -5574,11 +5522,11 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
- [(set_attr "simd" "no,yes,yes,yes")
- (set_attr "type" "shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")]
+ [(set_attr "simd" "no,no,yes,yes,yes")
-+ (set_attr "type" "bfm,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")]
++ (set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")]
)
(define_split
-@@ -3995,18 +4007,19 @@
+@@ -3976,18 +3992,19 @@
;; Arithmetic right shift using SISD or Integer instruction
(define_insn "*aarch64_ashr_sisd_or_int_<mode>3"
@@ -5599,11 +5547,11 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
- [(set_attr "simd" "no,yes,yes,yes")
- (set_attr "type" "shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")]
+ [(set_attr "simd" "no,no,yes,yes,yes")
-+ (set_attr "type" "bfm,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")]
++ (set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")]
)
(define_split
-@@ -4098,21 +4111,25 @@
+@@ -4079,21 +4096,25 @@
[(set (match_operand:GPI 0 "register_operand" "=r,r")
(rotatert:GPI
(match_operand:GPI 1 "register_operand" "r,r")
@@ -5633,11 +5581,20 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ "@
+ <shift>\\t%w0, %w1, %2
+ <shift>\\t%w0, %w1, %w2"
-+ [(set_attr "type" "bfm,shift_reg")]
++ [(set_attr "type" "bfx,shift_reg")]
)
(define_insn "*<optab><mode>3_insn"
-@@ -4136,7 +4153,7 @@
+@@ -4105,7 +4126,7 @@
+ operands[3] = GEN_INT (<sizen> - UINTVAL (operands[2]));
+ return "<bfshift>\t%w0, %w1, %2, %3";
+ }
+- [(set_attr "type" "bfm")]
++ [(set_attr "type" "bfx")]
+ )
+
+ (define_insn "*extr<mode>5_insn"
+@@ -4117,7 +4138,7 @@
"UINTVAL (operands[3]) < GET_MODE_BITSIZE (<MODE>mode) &&
(UINTVAL (operands[3]) + UINTVAL (operands[4]) == GET_MODE_BITSIZE (<MODE>mode))"
"extr\\t%<w>0, %<w>1, %<w>2, %4"
@@ -5646,7 +5603,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
)
;; There are no canonicalisation rules for ashift and lshiftrt inside an ior
-@@ -4151,7 +4168,7 @@
+@@ -4132,7 +4153,7 @@
&& (UINTVAL (operands[3]) + UINTVAL (operands[4])
== GET_MODE_BITSIZE (<MODE>mode))"
"extr\\t%<w>0, %<w>1, %<w>2, %4"
@@ -5655,7 +5612,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
)
;; zero_extend version of the above
-@@ -4165,7 +4182,7 @@
+@@ -4146,7 +4167,7 @@
"UINTVAL (operands[3]) < 32 &&
(UINTVAL (operands[3]) + UINTVAL (operands[4]) == 32)"
"extr\\t%w0, %w1, %w2, %4"
@@ -5664,7 +5621,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
)
(define_insn "*extrsi5_insn_uxtw_alt"
-@@ -4178,7 +4195,7 @@
+@@ -4159,7 +4180,7 @@
"UINTVAL (operands[3]) < 32 &&
(UINTVAL (operands[3]) + UINTVAL (operands[4]) == 32)"
"extr\\t%w0, %w1, %w2, %4"
@@ -5673,7 +5630,72 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
)
(define_insn "*ror<mode>3_insn"
-@@ -4367,9 +4384,7 @@
+@@ -4198,7 +4219,7 @@
+ operands[3] = GEN_INT (<SHORT:sizen> - UINTVAL (operands[2]));
+ return "<su>bfiz\t%<GPI:w>0, %<GPI:w>1, %2, %3";
+ }
+- [(set_attr "type" "bfm")]
++ [(set_attr "type" "bfx")]
+ )
+
+ (define_insn "*zero_extend<GPI:mode>_lshr<SHORT:mode>"
+@@ -4211,7 +4232,7 @@
+ operands[3] = GEN_INT (<SHORT:sizen> - UINTVAL (operands[2]));
+ return "ubfx\t%<GPI:w>0, %<GPI:w>1, %2, %3";
+ }
+- [(set_attr "type" "bfm")]
++ [(set_attr "type" "bfx")]
+ )
+
+ (define_insn "*extend<GPI:mode>_ashr<SHORT:mode>"
+@@ -4224,7 +4245,7 @@
+ operands[3] = GEN_INT (<SHORT:sizen> - UINTVAL (operands[2]));
+ return "sbfx\\t%<GPI:w>0, %<GPI:w>1, %2, %3";
+ }
+- [(set_attr "type" "bfm")]
++ [(set_attr "type" "bfx")]
+ )
+
+ ;; -------------------------------------------------------------------
+@@ -4256,7 +4277,27 @@
+ "IN_RANGE (INTVAL (operands[2]) + INTVAL (operands[3]),
+ 1, GET_MODE_BITSIZE (<MODE>mode) - 1)"
+ "<su>bfx\\t%<w>0, %<w>1, %3, %2"
+- [(set_attr "type" "bfm")]
++ [(set_attr "type" "bfx")]
++)
++
++;; When the bit position and width add up to 32 we can use a W-reg LSR
++;; instruction taking advantage of the implicit zero-extension of the X-reg.
++(define_split
++ [(set (match_operand:DI 0 "register_operand")
++ (zero_extract:DI (match_operand:DI 1 "register_operand")
++ (match_operand 2
++ "aarch64_simd_shift_imm_offset_di")
++ (match_operand 3
++ "aarch64_simd_shift_imm_di")))]
++ "IN_RANGE (INTVAL (operands[2]) + INTVAL (operands[3]), 1,
++ GET_MODE_BITSIZE (DImode) - 1)
++ && (INTVAL (operands[2]) + INTVAL (operands[3]))
++ == GET_MODE_BITSIZE (SImode)"
++ [(set (match_dup 0)
++ (zero_extend:DI (lshiftrt:SI (match_dup 4) (match_dup 3))))]
++ {
++ operands[4] = gen_lowpart (SImode, operands[1]);
++ }
+ )
+
+ ;; Bitfield Insert (insv)
+@@ -4338,7 +4379,7 @@
+ : GEN_INT (<GPI:sizen> - UINTVAL (operands[2]));
+ return "<su>bfiz\t%<GPI:w>0, %<GPI:w>1, %2, %3";
+ }
+- [(set_attr "type" "bfm")]
++ [(set_attr "type" "bfx")]
+ )
+
+ ;; XXX We should match (any_extend (ashift)) here, like (and (ashift)) below
+@@ -4348,11 +4389,27 @@
(and:GPI (ashift:GPI (match_operand:GPI 1 "register_operand" "r")
(match_operand 2 "const_int_operand" "n"))
(match_operand 3 "const_int_operand" "n")))]
@@ -5682,9 +5704,30 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
- && (INTVAL (operands[3]) & ((1 << INTVAL (operands[2])) - 1)) == 0"
+ "aarch64_mask_and_shift_for_ubfiz_p (<MODE>mode, operands[3], operands[2])"
"ubfiz\\t%<w>0, %<w>1, %2, %P3"
- [(set_attr "type" "bfm")]
+- [(set_attr "type" "bfm")]
++ [(set_attr "type" "bfx")]
++)
++
++;; When the bit position and width of the equivalent extraction add up to 32
++;; we can use a W-reg LSL instruction taking advantage of the implicit
++;; zero-extension of the X-reg.
++(define_split
++ [(set (match_operand:DI 0 "register_operand")
++ (and:DI (ashift:DI (match_operand:DI 1 "register_operand")
++ (match_operand 2 "const_int_operand"))
++ (match_operand 3 "const_int_operand")))]
++ "aarch64_mask_and_shift_for_ubfiz_p (DImode, operands[3], operands[2])
++ && (INTVAL (operands[2]) + popcount_hwi (INTVAL (operands[3])))
++ == GET_MODE_BITSIZE (SImode)"
++ [(set (match_dup 0)
++ (zero_extend:DI (ashift:SI (match_dup 4) (match_dup 2))))]
++ {
++ operands[4] = gen_lowpart (SImode, operands[1]);
++ }
)
-@@ -4439,22 +4454,23 @@
+
+ (define_insn "bswap<mode>2"
+@@ -4420,22 +4477,23 @@
;; Expands to btrunc, ceil, floor, nearbyint, rint, round, frintn.
(define_insn "<frint_pattern><mode>2"
@@ -5715,7 +5758,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
[(set_attr "type" "f_cvtf2i")]
)
-@@ -4480,23 +4496,24 @@
+@@ -4461,23 +4519,24 @@
;; fma - no throw
(define_insn "fma<mode>4"
@@ -5750,30 +5793,30 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
)
(define_insn "fms<mode>4"
-@@ -4582,19 +4599,11 @@
+@@ -4563,19 +4622,11 @@
[(set_attr "type" "f_cvt")]
)
-(define_insn "fix_trunc<GPF:mode><GPI:mode>2"
-- [(set (match_operand:GPI 0 "register_operand" "=r")
++(define_insn "<optab>_trunc<GPF_F16:mode><GPI:mode>2"
+ [(set (match_operand:GPI 0 "register_operand" "=r")
- (fix:GPI (match_operand:GPF 1 "register_operand" "w")))]
-- "TARGET_FLOAT"
++ (FIXUORS:GPI (match_operand:GPF_F16 1 "register_operand" "w")))]
+ "TARGET_FLOAT"
- "fcvtzs\\t%<GPI:w>0, %<GPF:s>1"
- [(set_attr "type" "f_cvtf2i")]
-)
-
-(define_insn "fixuns_trunc<GPF:mode><GPI:mode>2"
-+(define_insn "<optab>_trunc<GPF_F16:mode><GPI:mode>2"
- [(set (match_operand:GPI 0 "register_operand" "=r")
+- [(set (match_operand:GPI 0 "register_operand" "=r")
- (unsigned_fix:GPI (match_operand:GPF 1 "register_operand" "w")))]
-+ (FIXUORS:GPI (match_operand:GPF_F16 1 "register_operand" "w")))]
- "TARGET_FLOAT"
+- "TARGET_FLOAT"
- "fcvtzu\\t%<GPI:w>0, %<GPF:s>1"
+ "fcvtz<su>\t%<GPI:w>0, %<GPF_F16:s>1"
[(set_attr "type" "f_cvtf2i")]
)
-@@ -4618,38 +4627,116 @@
+@@ -4599,38 +4650,116 @@
[(set_attr "type" "f_cvti2f")]
)
@@ -5905,7 +5948,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
)
(define_insn "*fnmul<mode>3"
-@@ -4672,38 +4759,58 @@
+@@ -4653,38 +4782,58 @@
[(set_attr "type" "fmul<s>")]
)
@@ -5980,7 +6023,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
)
;; Given that smax/smin do not specify the result when either input is NaN,
-@@ -4728,15 +4835,17 @@
+@@ -4709,15 +4858,17 @@
[(set_attr "type" "f_minmax<s>")]
)
@@ -6006,7 +6049,25 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
)
;; For copysign (x, y), we want to generate:
-@@ -5201,7 +5310,7 @@
+@@ -4775,7 +4926,7 @@
+ [(set (match_operand:GPF_TF 0 "register_operand" "=w")
+ (mem:GPF_TF (match_operand 1 "aarch64_constant_pool_symref" "S")))
+ (clobber (match_operand:P 2 "register_operand" "=&r"))]
+- "TARGET_FLOAT && aarch64_nopcrelative_literal_loads"
++ "TARGET_FLOAT"
+ {
+ aarch64_expand_mov_immediate (operands[2], XEXP (operands[1], 0));
+ emit_move_insn (operands[0], gen_rtx_MEM (<GPF_TF:MODE>mode, operands[2]));
+@@ -4788,7 +4939,7 @@
+ [(set (match_operand:VALL 0 "register_operand" "=w")
+ (mem:VALL (match_operand 1 "aarch64_constant_pool_symref" "S")))
+ (clobber (match_operand:P 2 "register_operand" "=&r"))]
+- "TARGET_FLOAT && aarch64_nopcrelative_literal_loads"
++ "TARGET_FLOAT"
+ {
+ aarch64_expand_mov_immediate (operands[2], XEXP (operands[1], 0));
+ emit_move_insn (operands[0], gen_rtx_MEM (<VALL:MODE>mode, operands[2]));
+@@ -5182,7 +5333,7 @@
UNSPEC_SP_TEST))
(clobber (match_scratch:PTR 3 "=&r"))]
""
@@ -50878,30 +50939,17 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
$(srcdir)/config/aarch64/aarch64-c.c
---- /dev/null
-+++ b/src/gcc/config/aarch64/t-aarch64-freebsd
-@@ -0,0 +1,21 @@
-+# Machine description for AArch64 architecture.
-+# Copyright (C) 2016 Free Software Foundation, Inc.
-+#
-+# This file is part of GCC.
-+#
-+# GCC is free software; you can redistribute it and/or modify it
-+# under the terms of the GNU General Public License as published by
-+# the Free Software Foundation; either version 3, or (at your option)
-+# any later version.
-+#
-+# GCC is distributed in the hope that it will be useful, but
-+# WITHOUT ANY WARRANTY; without even the implied warranty of
-+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+# General Public License for more details.
-+#
-+# You should have received a copy of the GNU General Public License
-+# along with GCC; see the file COPYING3. If not see
-+# <http://www.gnu.org/licenses/>.
-+
-+LIB1ASMSRC = aarch64/lib1funcs.asm
-+LIB1ASMFUNCS = _aarch64_sync_cache_range
+--- a/src/gcc/config/aarch64/thunderx.md
++++ b/src/gcc/config/aarch64/thunderx.md
+@@ -39,7 +39,7 @@
+
+ (define_insn_reservation "thunderx_shift" 1
+ (and (eq_attr "tune" "thunderx")
+- (eq_attr "type" "bfm,extend,rotate_imm,shift_imm,shift_reg,rbit,rev"))
++ (eq_attr "type" "bfm,bfx,extend,rotate_imm,shift_imm,shift_reg,rbit,rev"))
+ "thunderx_pipe0 | thunderx_pipe1")
+
+
--- a/src/gcc/config/alpha/alpha.c
+++ b/src/gcc/config/alpha/alpha.c
@@ -26,6 +26,7 @@ along with GCC; see the file COPYING3. If not see
@@ -52143,7 +52191,15 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
extern void thumb_set_return_address (rtx, rtx);
extern const char *thumb1_output_casesi (rtx *);
extern const char *thumb2_output_casesi (rtx *);
-@@ -319,6 +327,7 @@ extern int vfp3_const_double_for_bits (rtx);
+@@ -256,7 +264,6 @@ struct cpu_cost_table;
+
+ struct tune_params
+ {
+- bool (*rtx_costs) (rtx, RTX_CODE, RTX_CODE, int *, bool);
+ const struct cpu_cost_table *insn_extra_cost;
+ bool (*sched_adjust_cost) (rtx_insn *, rtx, rtx_insn *, int *);
+ int (*branch_cost) (bool, bool);
+@@ -319,6 +326,7 @@ extern int vfp3_const_double_for_bits (rtx);
extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
rtx);
@@ -52151,7 +52207,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
extern bool arm_valid_symbolic_address_p (rtx);
extern bool arm_validize_comparison (rtx *, rtx *, rtx *);
#endif /* RTX_CODE */
-@@ -344,184 +353,6 @@ extern void arm_cpu_cpp_builtins (struct cpp_reader *);
+@@ -344,184 +352,6 @@ extern void arm_cpu_cpp_builtins (struct cpp_reader *);
extern bool arm_is_constant_pool_ref (rtx);
@@ -52336,7 +52392,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* The bits in this mask specify which
instructions we are allowed to generate. */
extern arm_feature_set insn_flags;
-@@ -601,6 +432,9 @@ extern int arm_tune_cortex_a9;
+@@ -601,6 +431,9 @@ extern int arm_tune_cortex_a9;
interworking clean. */
extern int arm_cpp_interwork;
@@ -52466,7 +52522,20 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
static void arm_output_function_epilogue (FILE *, HOST_WIDE_INT);
static void arm_output_function_prologue (FILE *, HOST_WIDE_INT);
static int arm_comp_type_attributes (const_tree, const_tree);
-@@ -249,8 +252,6 @@ static void arm_output_dwarf_dtprel (FILE *, int, rtx) ATTRIBUTE_UNUSED;
+@@ -164,12 +167,6 @@ static void arm_output_mi_thunk (FILE *, tree, HOST_WIDE_INT, HOST_WIDE_INT,
+ static bool arm_have_conditional_execution (void);
+ static bool arm_cannot_force_const_mem (machine_mode, rtx);
+ static bool arm_legitimate_constant_p (machine_mode, rtx);
+-static bool arm_rtx_costs_1 (rtx, enum rtx_code, int*, bool);
+-static bool arm_size_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *);
+-static bool arm_slowmul_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
+-static bool arm_fastmul_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
+-static bool arm_xscale_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
+-static bool arm_9e_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
+ static bool arm_rtx_costs (rtx, machine_mode, int, int, int *, bool);
+ static int arm_address_cost (rtx, machine_mode, addr_space_t, bool);
+ static int arm_register_move_cost (machine_mode, reg_class_t, reg_class_t);
+@@ -249,8 +246,6 @@ static void arm_output_dwarf_dtprel (FILE *, int, rtx) ATTRIBUTE_UNUSED;
static bool arm_output_addr_const_extra (FILE *, rtx);
static bool arm_allocate_stack_slots_for_args (void);
static bool arm_warn_func_return (tree);
@@ -52475,7 +52544,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
static tree arm_promoted_type (const_tree t);
static tree arm_convert_to_type (tree type, tree expr);
static bool arm_scalar_mode_supported_p (machine_mode);
-@@ -300,6 +301,9 @@ static void arm_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
+@@ -300,6 +295,9 @@ static void arm_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
static unsigned HOST_WIDE_INT arm_asan_shadow_offset (void);
static void arm_sched_fusion_priority (rtx_insn *, int, int *, int*);
@@ -52485,7 +52554,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Table of machine attributes. */
static const struct attribute_spec arm_attribute_table[] =
-@@ -343,6 +347,11 @@ static const struct attribute_spec arm_attribute_table[] =
+@@ -343,6 +341,11 @@ static const struct attribute_spec arm_attribute_table[] =
{ "notshared", 0, 0, false, true, false, arm_handle_notshared_attribute,
false },
#endif
@@ -52497,7 +52566,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
{ NULL, 0, 0, false, false, false, NULL, false }
};
-@@ -463,7 +472,7 @@ static const struct attribute_spec arm_attribute_table[] =
+@@ -463,7 +466,7 @@ static const struct attribute_spec arm_attribute_table[] =
#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK arm_output_mi_thunk
#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
@@ -52506,7 +52575,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS arm_rtx_costs
-@@ -654,12 +663,6 @@ static const struct attribute_spec arm_attribute_table[] =
+@@ -654,12 +657,6 @@ static const struct attribute_spec arm_attribute_table[] =
#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS arm_preferred_reload_class
@@ -52519,7 +52588,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE arm_promoted_type
-@@ -820,6 +823,13 @@ int arm_arch8 = 0;
+@@ -820,6 +817,13 @@ int arm_arch8 = 0;
/* Nonzero if this chip supports the ARMv8.1 extensions. */
int arm_arch8_1 = 0;
@@ -52533,7 +52602,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Nonzero if this chip can benefit from load scheduling. */
int arm_ld_sched = 0;
-@@ -852,6 +862,9 @@ int arm_tune_cortex_a9 = 0;
+@@ -852,6 +856,9 @@ int arm_tune_cortex_a9 = 0;
interworking clean. */
int arm_cpp_interwork = 0;
@@ -52543,7 +52612,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Nonzero if chip supports Thumb 2. */
int arm_arch_thumb2;
-@@ -892,6 +905,9 @@ int arm_condexec_masklen = 0;
+@@ -892,6 +899,9 @@ int arm_condexec_masklen = 0;
/* Nonzero if chip supports the ARMv8 CRC instructions. */
int arm_arch_crc = 0;
@@ -52553,13 +52622,154 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Nonzero if the core has a very small, high-latency, multiply unit. */
int arm_m_profile_small_mul = 0;
-@@ -2055,6 +2071,29 @@ const struct tune_params arm_xgene1_tune =
+@@ -1684,8 +1694,7 @@ const struct cpu_cost_table v7m_extra_costs =
+
+ const struct tune_params arm_slowmul_tune =
+ {
+- arm_slowmul_rtx_costs,
+- NULL, /* Insn extra costs. */
++ &generic_extra_costs, /* Insn extra costs. */
+ NULL, /* Sched adj cost. */
+ arm_default_branch_cost,
+ &arm_default_vec_cost,
+@@ -1707,8 +1716,7 @@ const struct tune_params arm_slowmul_tune =
+
+ const struct tune_params arm_fastmul_tune =
+ {
+- arm_fastmul_rtx_costs,
+- NULL, /* Insn extra costs. */
++ &generic_extra_costs, /* Insn extra costs. */
+ NULL, /* Sched adj cost. */
+ arm_default_branch_cost,
+ &arm_default_vec_cost,
+@@ -1733,8 +1741,7 @@ const struct tune_params arm_fastmul_tune =
+
+ const struct tune_params arm_strongarm_tune =
+ {
+- arm_fastmul_rtx_costs,
+- NULL, /* Insn extra costs. */
++ &generic_extra_costs, /* Insn extra costs. */
+ NULL, /* Sched adj cost. */
+ arm_default_branch_cost,
+ &arm_default_vec_cost,
+@@ -1756,8 +1763,7 @@ const struct tune_params arm_strongarm_tune =
+
+ const struct tune_params arm_xscale_tune =
+ {
+- arm_xscale_rtx_costs,
+- NULL, /* Insn extra costs. */
++ &generic_extra_costs, /* Insn extra costs. */
+ xscale_sched_adjust_cost,
+ arm_default_branch_cost,
+ &arm_default_vec_cost,
+@@ -1779,8 +1785,7 @@ const struct tune_params arm_xscale_tune =
+
+ const struct tune_params arm_9e_tune =
+ {
+- arm_9e_rtx_costs,
+- NULL, /* Insn extra costs. */
++ &generic_extra_costs, /* Insn extra costs. */
+ NULL, /* Sched adj cost. */
+ arm_default_branch_cost,
+ &arm_default_vec_cost,
+@@ -1802,8 +1807,7 @@ const struct tune_params arm_9e_tune =
+
+ const struct tune_params arm_marvell_pj4_tune =
+ {
+- arm_9e_rtx_costs,
+- NULL, /* Insn extra costs. */
++ &generic_extra_costs, /* Insn extra costs. */
+ NULL, /* Sched adj cost. */
+ arm_default_branch_cost,
+ &arm_default_vec_cost,
+@@ -1825,8 +1829,7 @@ const struct tune_params arm_marvell_pj4_tune =
+
+ const struct tune_params arm_v6t2_tune =
+ {
+- arm_9e_rtx_costs,
+- NULL, /* Insn extra costs. */
++ &generic_extra_costs, /* Insn extra costs. */
+ NULL, /* Sched adj cost. */
+ arm_default_branch_cost,
+ &arm_default_vec_cost,
+@@ -1850,7 +1853,6 @@ const struct tune_params arm_v6t2_tune =
+ /* Generic Cortex tuning. Use more specific tunings if appropriate. */
+ const struct tune_params arm_cortex_tune =
+ {
+- arm_9e_rtx_costs,
+ &generic_extra_costs,
+ NULL, /* Sched adj cost. */
+ arm_default_branch_cost,
+@@ -1873,7 +1875,6 @@ const struct tune_params arm_cortex_tune =
+
+ const struct tune_params arm_cortex_a8_tune =
+ {
+- arm_9e_rtx_costs,
+ &cortexa8_extra_costs,
+ NULL, /* Sched adj cost. */
+ arm_default_branch_cost,
+@@ -1896,7 +1897,6 @@ const struct tune_params arm_cortex_a8_tune =
+
+ const struct tune_params arm_cortex_a7_tune =
+ {
+- arm_9e_rtx_costs,
+ &cortexa7_extra_costs,
+ NULL, /* Sched adj cost. */
+ arm_default_branch_cost,
+@@ -1919,7 +1919,6 @@ const struct tune_params arm_cortex_a7_tune =
+
+ const struct tune_params arm_cortex_a15_tune =
+ {
+- arm_9e_rtx_costs,
+ &cortexa15_extra_costs,
+ NULL, /* Sched adj cost. */
+ arm_default_branch_cost,
+@@ -1942,7 +1941,6 @@ const struct tune_params arm_cortex_a15_tune =
+
+ const struct tune_params arm_cortex_a35_tune =
+ {
+- arm_9e_rtx_costs,
+ &cortexa53_extra_costs,
+ NULL, /* Sched adj cost. */
+ arm_default_branch_cost,
+@@ -1965,7 +1963,6 @@ const struct tune_params arm_cortex_a35_tune =
+
+ const struct tune_params arm_cortex_a53_tune =
+ {
+- arm_9e_rtx_costs,
+ &cortexa53_extra_costs,
+ NULL, /* Sched adj cost. */
+ arm_default_branch_cost,
+@@ -1988,7 +1985,6 @@ const struct tune_params arm_cortex_a53_tune =
+
+ const struct tune_params arm_cortex_a57_tune =
+ {
+- arm_9e_rtx_costs,
+ &cortexa57_extra_costs,
+ NULL, /* Sched adj cost. */
+ arm_default_branch_cost,
+@@ -2011,7 +2007,6 @@ const struct tune_params arm_cortex_a57_tune =
+
+ const struct tune_params arm_exynosm1_tune =
+ {
+- arm_9e_rtx_costs,
+ &exynosm1_extra_costs,
+ NULL, /* Sched adj cost. */
+ arm_default_branch_cost,
+@@ -2034,7 +2029,6 @@ const struct tune_params arm_exynosm1_tune =
+
+ const struct tune_params arm_xgene1_tune =
+ {
+- arm_9e_rtx_costs,
+ &xgene1_extra_costs,
+ NULL, /* Sched adj cost. */
+ arm_default_branch_cost,
+@@ -2055,12 +2049,33 @@ const struct tune_params arm_xgene1_tune =
tune_params::SCHED_AUTOPREF_OFF
};
+const struct tune_params arm_qdf24xx_tune =
+{
-+ arm_9e_rtx_costs,
+ &qdf24xx_extra_costs,
+ NULL, /* Scheduler cost adjustment. */
+ arm_default_branch_cost,
@@ -52583,13 +52793,34 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Branches can be dual-issued on Cortex-A5, so conditional execution is
less appealing. Set max_insns_skipped to a low value. */
-@@ -2127,6 +2166,29 @@ const struct tune_params arm_cortex_a12_tune =
+ const struct tune_params arm_cortex_a5_tune =
+ {
+- arm_9e_rtx_costs,
+ &cortexa5_extra_costs,
+ NULL, /* Sched adj cost. */
+ arm_cortex_a5_branch_cost,
+@@ -2083,7 +2098,6 @@ const struct tune_params arm_cortex_a5_tune =
+
+ const struct tune_params arm_cortex_a9_tune =
+ {
+- arm_9e_rtx_costs,
+ &cortexa9_extra_costs,
+ cortex_a9_sched_adjust_cost,
+ arm_default_branch_cost,
+@@ -2106,7 +2120,6 @@ const struct tune_params arm_cortex_a9_tune =
+
+ const struct tune_params arm_cortex_a12_tune =
+ {
+- arm_9e_rtx_costs,
+ &cortexa12_extra_costs,
+ NULL, /* Sched adj cost. */
+ arm_default_branch_cost,
+@@ -2127,6 +2140,28 @@ const struct tune_params arm_cortex_a12_tune =
tune_params::SCHED_AUTOPREF_OFF
};
+const struct tune_params arm_cortex_a73_tune =
+{
-+ arm_9e_rtx_costs,
+ &cortexa57_extra_costs,
+ NULL, /* Sched adj cost. */
+ arm_default_branch_cost,
@@ -52613,7 +52844,23 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* armv7m tuning. On Cortex-M4 cores for example, MOVW/MOVT take a single
cycle to execute each. An LDR from the constant pool also takes two cycles
to execute, but mildly increases pipelining opportunity (consecutive
-@@ -2183,7 +2245,8 @@ const struct tune_params arm_cortex_m7_tune =
+@@ -2136,7 +2171,6 @@ const struct tune_params arm_cortex_a12_tune =
+
+ const struct tune_params arm_v7m_tune =
+ {
+- arm_9e_rtx_costs,
+ &v7m_extra_costs,
+ NULL, /* Sched adj cost. */
+ arm_cortex_m_branch_cost,
+@@ -2161,7 +2195,6 @@ const struct tune_params arm_v7m_tune =
+
+ const struct tune_params arm_cortex_m7_tune =
+ {
+- arm_9e_rtx_costs,
+ &v7m_extra_costs,
+ NULL, /* Sched adj cost. */
+ arm_cortex_m7_branch_cost,
+@@ -2183,11 +2216,11 @@ const struct tune_params arm_cortex_m7_tune =
};
/* The arm_v6m_tune is duplicated from arm_cortex_tune, rather than
@@ -52622,8 +52869,23 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ cortex-m23. */
const struct tune_params arm_v6m_tune =
{
- arm_9e_rtx_costs,
-@@ -2264,16 +2327,18 @@ static const struct processors *arm_selected_arch;
+- arm_9e_rtx_costs,
+- NULL, /* Insn extra costs. */
++ &generic_extra_costs, /* Insn extra costs. */
+ NULL, /* Sched adj cost. */
+ arm_default_branch_cost,
+ &arm_default_vec_cost, /* Vectorizer costs. */
+@@ -2209,8 +2242,7 @@ const struct tune_params arm_v6m_tune =
+
+ const struct tune_params arm_fa726te_tune =
+ {
+- arm_9e_rtx_costs,
+- NULL, /* Insn extra costs. */
++ &generic_extra_costs, /* Insn extra costs. */
+ fa726te_sched_adjust_cost,
+ arm_default_branch_cost,
+ &arm_default_vec_cost,
+@@ -2264,16 +2296,18 @@ static const struct processors *arm_selected_arch;
static const struct processors *arm_selected_cpu;
static const struct processors *arm_selected_tune;
@@ -52646,7 +52908,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
#include "arm-fpus.def"
#undef ARM_FPU
};
-@@ -2752,8 +2817,8 @@ arm_option_check_internal (struct gcc_options *opts)
+@@ -2752,8 +2786,8 @@ arm_option_check_internal (struct gcc_options *opts)
const struct arm_fpu_desc *fpu_desc = &all_fpus[opts->x_arm_fpu_index];
/* iWMMXt and NEON are incompatible. */
@@ -52657,7 +52919,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
error ("iWMMXt and NEON are incompatible");
/* Make sure that the processor choice does not conflict with any of the
-@@ -2907,7 +2972,8 @@ arm_option_override_internal (struct gcc_options *opts,
+@@ -2907,7 +2941,8 @@ arm_option_override_internal (struct gcc_options *opts,
if (! opts_set->x_arm_restrict_it)
opts->x_arm_restrict_it = arm_arch8;
@@ -52667,7 +52929,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
opts->x_arm_restrict_it = 0;
/* Enable -munaligned-access by default for
-@@ -2918,7 +2984,8 @@ arm_option_override_internal (struct gcc_options *opts,
+@@ -2918,7 +2953,8 @@ arm_option_override_internal (struct gcc_options *opts,
Disable -munaligned-access by default for
- all pre-ARMv6 architecture-based processors
@@ -52677,7 +52939,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if (! opts_set->x_unaligned_access)
{
-@@ -3170,6 +3237,8 @@ arm_option_override (void)
+@@ -3170,6 +3206,8 @@ arm_option_override (void)
arm_arch7em = ARM_FSET_HAS_CPU1 (insn_flags, FL_ARCH7EM);
arm_arch8 = ARM_FSET_HAS_CPU1 (insn_flags, FL_ARCH8);
arm_arch8_1 = ARM_FSET_HAS_CPU2 (insn_flags, FL2_ARCH8_1);
@@ -52686,7 +52948,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
arm_arch_thumb2 = ARM_FSET_HAS_CPU1 (insn_flags, FL_THUMB2);
arm_arch_xscale = ARM_FSET_HAS_CPU1 (insn_flags, FL_XSCALE);
-@@ -3184,7 +3253,15 @@ arm_option_override (void)
+@@ -3184,7 +3222,15 @@ arm_option_override (void)
arm_arch_no_volatile_ce = ARM_FSET_HAS_CPU1 (insn_flags, FL_NO_VOLATILE_CE);
arm_tune_cortex_a9 = (arm_tune == cortexa9) != 0;
arm_arch_crc = ARM_FSET_HAS_CPU1 (insn_flags, FL_CRC32);
@@ -52702,7 +52964,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* V5 code we generate is completely interworking capable, so we turn off
TARGET_INTERWORK here to avoid many tests later on. */
-@@ -3222,10 +3299,8 @@ arm_option_override (void)
+@@ -3222,10 +3268,8 @@ arm_option_override (void)
/* If soft-float is specified then don't use FPU. */
if (TARGET_SOFT_FLOAT)
arm_fpu_attr = FPU_NONE;
@@ -52714,7 +52976,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if (TARGET_AAPCS_BASED)
{
-@@ -3245,15 +3320,14 @@ arm_option_override (void)
+@@ -3245,15 +3289,14 @@ arm_option_override (void)
if (arm_abi == ARM_ABI_IWMMXT)
arm_pcs_default = ARM_PCS_AAPCS_IWMMXT;
else if (arm_float_abi == ARM_FLOAT_ABI_HARD
@@ -52732,7 +52994,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
sorry ("-mfloat-abi=hard and VFP");
if (arm_abi == ARM_ABI_APCS)
-@@ -3298,6 +3372,20 @@ arm_option_override (void)
+@@ -3298,6 +3341,20 @@ arm_option_override (void)
}
}
@@ -52753,7 +53015,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* If stack checking is disabled, we can use r10 as the PIC register,
which keeps r9 available. The EABI specifies r9 as the PIC register. */
if (flag_pic && TARGET_SINGLE_PIC_BASE)
-@@ -3329,10 +3417,6 @@ arm_option_override (void)
+@@ -3329,10 +3386,6 @@ arm_option_override (void)
arm_pic_register = pic_register;
}
@@ -52764,7 +53026,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Enable -mfix-cortex-m3-ldrd by default for Cortex-M3 cores. */
if (fix_cm3_ldrd == 2)
{
-@@ -3436,6 +3520,9 @@ arm_option_override (void)
+@@ -3436,6 +3489,9 @@ arm_option_override (void)
if (target_slow_flash_data)
arm_disable_literal_pool = true;
@@ -52774,7 +53036,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Disable scheduling fusion by default if it's not armv7 processor
or doesn't prefer ldrd/strd. */
if (flag_schedule_fusion == 2
-@@ -3568,6 +3655,9 @@ arm_compute_func_type (void)
+@@ -3568,6 +3624,9 @@ arm_compute_func_type (void)
else
type |= arm_isr_value (TREE_VALUE (a));
@@ -52784,7 +53046,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
return type;
}
-@@ -3794,6 +3884,11 @@ use_return_insn (int iscond, rtx sibling)
+@@ -3794,6 +3853,11 @@ use_return_insn (int iscond, rtx sibling)
return 0;
}
@@ -52796,7 +53058,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* If there are saved registers but the LR isn't saved, then we need
two instructions for the return. */
if (saved_int_regs && !(saved_int_regs & (1 << LR_REGNUM)))
-@@ -3801,7 +3896,7 @@ use_return_insn (int iscond, rtx sibling)
+@@ -3801,7 +3865,7 @@ use_return_insn (int iscond, rtx sibling)
/* Can't be done if any of the VFP regs are pushed,
since this also requires an insn. */
@@ -52805,7 +53067,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
for (regno = FIRST_VFP_REGNUM; regno <= LAST_VFP_REGNUM; regno++)
if (df_regs_ever_live_p (regno) && !call_used_regs[regno])
return 0;
-@@ -3899,7 +3994,7 @@ const_ok_for_op (HOST_WIDE_INT i, enum rtx_code code)
+@@ -3899,7 +3963,7 @@ const_ok_for_op (HOST_WIDE_INT i, enum rtx_code code)
{
case SET:
/* See if we can use movw. */
@@ -52814,7 +53076,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
return 1;
else
/* Otherwise, try mvn. */
-@@ -4118,7 +4213,7 @@ optimal_immediate_sequence (enum rtx_code code, unsigned HOST_WIDE_INT val,
+@@ -4118,7 +4182,7 @@ optimal_immediate_sequence (enum rtx_code code, unsigned HOST_WIDE_INT val,
yield a shorter sequence, we may as well use zero. */
insns1 = optimal_immediate_sequence_1 (code, val, return_sequence, best_start);
if (best_start != 0
@@ -52823,7 +53085,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
{
insns2 = optimal_immediate_sequence_1 (code, val, &tmp_sequence, 0);
if (insns2 <= insns1)
-@@ -4949,7 +5044,7 @@ arm_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
+@@ -4949,7 +5013,7 @@ arm_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
if (mode == VOIDmode)
mode = GET_MODE (*op1);
@@ -52832,7 +53094,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* For DImode, we have GE/LT/GEU/LTU comparisons. In ARM mode
we can also use cmp/cmpeq for GTU/LEU. GT/LE must be either
-@@ -5255,7 +5350,6 @@ arm_function_value_regno_p (const unsigned int regno)
+@@ -5255,7 +5319,6 @@ arm_function_value_regno_p (const unsigned int regno)
if (regno == ARG_REGISTER (1)
|| (TARGET_32BIT
&& TARGET_AAPCS_BASED
@@ -52840,7 +53102,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
&& TARGET_HARD_FLOAT
&& regno == FIRST_VFP_REGNUM)
|| (TARGET_IWMMXT_ABI
-@@ -5274,7 +5368,7 @@ arm_apply_result_size (void)
+@@ -5274,7 +5337,7 @@ arm_apply_result_size (void)
if (TARGET_32BIT)
{
@@ -52849,7 +53111,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
size += 32;
if (TARGET_IWMMXT_ABI)
size += 8;
-@@ -5549,7 +5643,7 @@ aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
+@@ -5549,7 +5612,7 @@ aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
{
case REAL_TYPE:
mode = TYPE_MODE (type);
@@ -52858,7 +53120,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
return -1;
if (*modep == VOIDmode)
-@@ -5722,7 +5816,7 @@ use_vfp_abi (enum arm_pcs pcs_variant, bool is_double)
+@@ -5722,7 +5785,7 @@ use_vfp_abi (enum arm_pcs pcs_variant, bool is_double)
if (pcs_variant != ARM_PCS_AAPCS_LOCAL)
return false;
@@ -52867,7 +53129,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(TARGET_VFP_DOUBLE || !is_double));
}
-@@ -5797,11 +5891,16 @@ aapcs_vfp_is_call_candidate (CUMULATIVE_ARGS *pcum, machine_mode mode,
+@@ -5797,11 +5860,16 @@ aapcs_vfp_is_call_candidate (CUMULATIVE_ARGS *pcum, machine_mode mode,
&pcum->aapcs_vfp_rcount);
}
@@ -52885,7 +53147,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
unsigned mask = (1 << (shift * pcum->aapcs_vfp_rcount)) - 1;
int regno;
-@@ -5850,6 +5949,9 @@ aapcs_vfp_allocate (CUMULATIVE_ARGS *pcum, machine_mode mode,
+@@ -5850,6 +5918,9 @@ aapcs_vfp_allocate (CUMULATIVE_ARGS *pcum, machine_mode mode,
return false;
}
@@ -52895,7 +53157,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
static rtx
aapcs_vfp_allocate_return_reg (enum arm_pcs pcs_variant ATTRIBUTE_UNUSED,
machine_mode mode,
-@@ -5940,13 +6042,13 @@ static struct
+@@ -5940,13 +6011,13 @@ static struct
required for a return from FUNCTION_ARG. */
bool (*allocate) (CUMULATIVE_ARGS *, machine_mode, const_tree);
@@ -52914,7 +53176,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
rtx (*allocate_return_reg) (enum arm_pcs, machine_mode, const_tree);
/* Finish processing this argument and prepare to start processing
-@@ -6561,6 +6663,185 @@ arm_handle_notshared_attribute (tree *node,
+@@ -6561,6 +6632,185 @@ arm_handle_notshared_attribute (tree *node,
}
#endif
@@ -53100,7 +53362,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Return 0 if the attributes for two types are incompatible, 1 if they
are compatible, and 2 if they are nearly compatible (which causes a
warning to be generated). */
-@@ -6601,6 +6882,14 @@ arm_comp_type_attributes (const_tree type1, const_tree type2)
+@@ -6601,6 +6851,14 @@ arm_comp_type_attributes (const_tree type1, const_tree type2)
if (l1 != l2)
return 0;
@@ -53115,7 +53377,16 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
return 1;
}
-@@ -6719,6 +7008,20 @@ arm_function_ok_for_sibcall (tree decl, tree exp)
+@@ -6711,7 +6969,7 @@ arm_function_ok_for_sibcall (tree decl, tree exp)
+ may be used both as target of the call and base register for restoring
+ the VFP registers */
+ if (TARGET_APCS_FRAME && TARGET_ARM
+- && TARGET_HARD_FLOAT && TARGET_VFP
++ && TARGET_HARD_FLOAT
+ && decl && arm_is_long_call_p (decl))
+ return false;
+
+@@ -6727,6 +6985,20 @@ arm_function_ok_for_sibcall (tree decl, tree exp)
if (IS_INTERRUPT (func_type))
return false;
@@ -53136,7 +53407,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if (!VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
{
/* Check that the return value locations are the same. For
-@@ -7175,8 +7478,7 @@ arm_legitimate_address_outer_p (machine_mode mode, rtx x, RTX_CODE outer,
+@@ -7187,8 +7459,7 @@ arm_legitimate_address_outer_p (machine_mode mode, rtx x, RTX_CODE outer,
return 1;
use_ldrd = (TARGET_LDRD
@@ -53146,7 +53417,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if (code == POST_INC || code == PRE_DEC
|| ((code == PRE_INC || code == POST_DEC)
-@@ -7261,8 +7563,7 @@ thumb2_legitimate_address_p (machine_mode mode, rtx x, int strict_p)
+@@ -7273,8 +7544,7 @@ thumb2_legitimate_address_p (machine_mode mode, rtx x, int strict_p)
return 1;
use_ldrd = (TARGET_LDRD
@@ -53156,7 +53427,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if (code == POST_INC || code == PRE_DEC
|| ((code == PRE_INC || code == POST_DEC)
-@@ -7355,7 +7656,6 @@ arm_legitimate_index_p (machine_mode mode, rtx index, RTX_CODE outer,
+@@ -7367,7 +7637,6 @@ arm_legitimate_index_p (machine_mode mode, rtx index, RTX_CODE outer,
/* Standard coprocessor addressing modes. */
if (TARGET_HARD_FLOAT
@@ -53164,7 +53435,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
&& (mode == SFmode || mode == DFmode))
return (code == CONST_INT && INTVAL (index) < 1024
&& INTVAL (index) > -1024
-@@ -7475,7 +7775,6 @@ thumb2_legitimate_index_p (machine_mode mode, rtx index, int strict_p)
+@@ -7487,7 +7756,6 @@ thumb2_legitimate_index_p (machine_mode mode, rtx index, int strict_p)
/* ??? Combine arm and thumb2 coprocessor addressing modes. */
/* Standard coprocessor addressing modes. */
if (TARGET_HARD_FLOAT
@@ -53172,7 +53443,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
&& (mode == SFmode || mode == DFmode))
return (code == CONST_INT && INTVAL (index) < 1024
/* Thumb-2 allows only > -256 index range for it's core register
-@@ -8021,8 +8320,7 @@ arm_legitimize_address (rtx x, rtx orig_x, machine_mode mode)
+@@ -8033,8 +8301,7 @@ arm_legitimize_address (rtx x, rtx orig_x, machine_mode mode)
/* VFP addressing modes actually allow greater offsets, but for
now we just stick with the lowest common denominator. */
@@ -53182,7 +53453,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
{
low_n = n & 0x0f;
n &= ~0x0f;
-@@ -8214,6 +8512,12 @@ arm_legitimate_constant_p_1 (machine_mode, rtx x)
+@@ -8226,6 +8493,12 @@ arm_legitimate_constant_p_1 (machine_mode, rtx x)
static bool
thumb_legitimate_constant_p (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
{
@@ -53195,7 +53466,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
return (CONST_INT_P (x)
|| CONST_DOUBLE_P (x)
|| CONSTANT_ADDRESS_P (x)
-@@ -8300,7 +8604,9 @@ thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
+@@ -8312,7 +8585,9 @@ thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
case CONST_INT:
if (outer == SET)
{
@@ -53206,7 +53477,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
return 0;
if (thumb_shiftable_const (INTVAL (x)))
return COSTS_N_INSNS (2);
-@@ -8317,8 +8623,8 @@ thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
+@@ -8329,8 +8604,8 @@ thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
int i;
/* This duplicates the tests in the andsi3 expander. */
for (i = 9; i <= 31; i++)
@@ -53217,24 +53488,114 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
return COSTS_N_INSNS (2);
}
else if (outer == ASHIFT || outer == ASHIFTRT
-@@ -9003,7 +9309,7 @@ static inline int
- thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
+@@ -8393,1006 +8668,162 @@ thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
+ }
+ }
+
+-static inline bool
+-arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
++/* Estimates the size cost of thumb1 instructions.
++ For now most of the code is copied from thumb1_rtx_costs. We need more
++ fine-grained tuning when we have more related test cases. */
++static inline int
++thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
{
machine_mode mode = GET_MODE (x);
-- int words;
+- enum rtx_code subcode;
+- rtx operand;
+- enum rtx_code code = GET_CODE (x);
+- *total = 0;
+ int words, cost;
switch (code)
{
-@@ -9049,17 +9355,27 @@ thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
- /* A SET doesn't have a mode, so let's look at the SET_DEST to get
- the mode. */
- words = ARM_NUM_INTS (GET_MODE_SIZE (GET_MODE (SET_DEST (x))));
-- return COSTS_N_INSNS (words)
-- + COSTS_N_INSNS (1) * (satisfies_constraint_J (SET_SRC (x))
-- || satisfies_constraint_K (SET_SRC (x))
-- /* thumb1_movdi_insn. */
-- || ((words > 1) && MEM_P (SET_SRC (x))));
+- case MEM:
+- /* Memory costs quite a lot for the first word, but subsequent words
+- load at the equivalent of a single insn each. */
+- *total = COSTS_N_INSNS (2 + ARM_NUM_REGS (mode));
+- return true;
+-
+- case DIV:
+- case MOD:
+- case UDIV:
+- case UMOD:
+- if (TARGET_HARD_FLOAT && mode == SFmode)
+- *total = COSTS_N_INSNS (2);
+- else if (TARGET_HARD_FLOAT && mode == DFmode && !TARGET_VFP_SINGLE)
+- *total = COSTS_N_INSNS (4);
+- else
+- *total = COSTS_N_INSNS (20);
+- return false;
+-
+- case ROTATE:
+- if (REG_P (XEXP (x, 1)))
+- *total = COSTS_N_INSNS (1); /* Need to subtract from 32 */
+- else if (!CONST_INT_P (XEXP (x, 1)))
+- *total = rtx_cost (XEXP (x, 1), mode, code, 1, speed);
+-
+- /* Fall through */
++ case ASHIFT:
++ case ASHIFTRT:
++ case LSHIFTRT:
+ case ROTATERT:
+- if (mode != SImode)
+- {
+- *total += COSTS_N_INSNS (4);
+- return true;
+- }
++ return (mode == SImode) ? COSTS_N_INSNS (1) : COSTS_N_INSNS (2);
+
+- /* Fall through */
+- case ASHIFT: case LSHIFTRT: case ASHIFTRT:
+- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
+- if (mode == DImode)
+- {
+- *total += COSTS_N_INSNS (3);
+- return true;
+- }
++ case PLUS:
++ case MINUS:
++ /* Thumb-1 needs two instructions to fulfill shiftadd/shiftsub0/shiftsub1
++ defined by RTL expansion, especially for the expansion of
++ multiplication. */
++ if ((GET_CODE (XEXP (x, 0)) == MULT
++ && power_of_two_operand (XEXP (XEXP (x,0),1), SImode))
++ || (GET_CODE (XEXP (x, 1)) == MULT
++ && power_of_two_operand (XEXP (XEXP (x, 1), 1), SImode)))
++ return COSTS_N_INSNS (2);
++ /* Deliberately fall through for normal RTX. */
++ case COMPARE:
++ case NEG:
++ case NOT:
++ return COSTS_N_INSNS (1);
+
+- *total += COSTS_N_INSNS (1);
+- /* Increase the cost of complex shifts because they aren't any faster,
+- and reduce dual issue opportunities. */
+- if (arm_tune_cortex_a9
+- && outer != SET && !CONST_INT_P (XEXP (x, 1)))
+- ++*total;
++ case MULT:
++ if (CONST_INT_P (XEXP (x, 1)))
++ {
++ /* Thumb1 mul instruction can't operate on const. We must load it
++ into a register first. */
++ int const_size = thumb1_size_rtx_costs (XEXP (x, 1), CONST_INT, SET);
++ /* For the targets which have a very small and high-latency multiply
++ unit, we prefer to synthesize the mult with up to 5 instructions,
++ giving a good balance between size and performance. */
++ if (arm_arch6m && arm_m_profile_small_mul)
++ return COSTS_N_INSNS (5);
++ else
++ return COSTS_N_INSNS (1) + const_size;
++ }
++ return COSTS_N_INSNS (1);
+
+- return true;
++ case SET:
++ /* A SET doesn't have a mode, so let's look at the SET_DEST to get
++ the mode. */
++ words = ARM_NUM_INTS (GET_MODE_SIZE (GET_MODE (SET_DEST (x))));
+ cost = COSTS_N_INSNS (words);
+ if (satisfies_constraint_J (SET_SRC (x))
+ || satisfies_constraint_K (SET_SRC (x))
@@ -53248,30 +53609,1036 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ cost += COSTS_N_INSNS (1);
+ return cost;
- case CONST_INT:
- if (outer == SET)
- {
-- if ((unsigned HOST_WIDE_INT) INTVAL (x) < 256)
+- case MINUS:
+- if (mode == DImode)
+- {
+- *total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
+- if (CONST_INT_P (XEXP (x, 0))
+- && const_ok_for_arm (INTVAL (XEXP (x, 0))))
+- {
+- *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed);
+- return true;
+- }
++ case CONST_INT:
++ if (outer == SET)
++ {
+ if (UINTVAL (x) < 256)
- return COSTS_N_INSNS (1);
++ return COSTS_N_INSNS (1);
+ /* movw is 4byte long. */
+ if (TARGET_HAVE_MOVT && !(INTVAL (x) & 0xffff0000))
+ return COSTS_N_INSNS (2);
- /* See split "TARGET_THUMB1 && satisfies_constraint_J". */
- if (INTVAL (x) >= -255 && INTVAL (x) <= -1)
- return COSTS_N_INSNS (2);
-@@ -9079,8 +9395,8 @@ thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
- int i;
- /* This duplicates the tests in the andsi3 expander. */
- for (i = 9; i <= 31; i++)
-- if ((((HOST_WIDE_INT) 1) << i) - 1 == INTVAL (x)
-- || (((HOST_WIDE_INT) 1) << i) - 1 == ~INTVAL (x))
++ /* See split "TARGET_THUMB1 && satisfies_constraint_J". */
++ if (INTVAL (x) >= -255 && INTVAL (x) <= -1)
++ return COSTS_N_INSNS (2);
++ /* See split "TARGET_THUMB1 && satisfies_constraint_K". */
++ if (thumb_shiftable_const (INTVAL (x)))
++ return COSTS_N_INSNS (2);
++ return COSTS_N_INSNS (3);
++ }
++ else if ((outer == PLUS || outer == COMPARE)
++ && INTVAL (x) < 256 && INTVAL (x) > -256)
++ return 0;
++ else if ((outer == IOR || outer == XOR || outer == AND)
++ && INTVAL (x) < 256 && INTVAL (x) >= -256)
++ return COSTS_N_INSNS (1);
++ else if (outer == AND)
++ {
++ int i;
++ /* This duplicates the tests in the andsi3 expander. */
++ for (i = 9; i <= 31; i++)
+ if ((HOST_WIDE_INT_1 << i) - 1 == INTVAL (x)
+ || (HOST_WIDE_INT_1 << i) - 1 == ~INTVAL (x))
- return COSTS_N_INSNS (2);
- }
- else if (outer == ASHIFT || outer == ASHIFTRT
-@@ -10759,8 +11075,6 @@ arm_new_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
++ return COSTS_N_INSNS (2);
++ }
++ else if (outer == ASHIFT || outer == ASHIFTRT
++ || outer == LSHIFTRT)
++ return 0;
++ return COSTS_N_INSNS (2);
+
+- if (CONST_INT_P (XEXP (x, 1))
+- && const_ok_for_arm (INTVAL (XEXP (x, 1))))
+- {
+- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
+- return true;
+- }
++ case CONST:
++ case CONST_DOUBLE:
++ case LABEL_REF:
++ case SYMBOL_REF:
++ return COSTS_N_INSNS (3);
+
+- return false;
+- }
+-
+- if (GET_MODE_CLASS (mode) == MODE_FLOAT)
+- {
+- if (TARGET_HARD_FLOAT
+- && (mode == SFmode
+- || (mode == DFmode && !TARGET_VFP_SINGLE)))
+- {
+- *total = COSTS_N_INSNS (1);
+- if (CONST_DOUBLE_P (XEXP (x, 0))
+- && arm_const_double_rtx (XEXP (x, 0)))
+- {
+- *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed);
+- return true;
+- }
+-
+- if (CONST_DOUBLE_P (XEXP (x, 1))
+- && arm_const_double_rtx (XEXP (x, 1)))
+- {
+- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
+- return true;
+- }
+-
+- return false;
+- }
+- *total = COSTS_N_INSNS (20);
+- return false;
+- }
+-
+- *total = COSTS_N_INSNS (1);
+- if (CONST_INT_P (XEXP (x, 0))
+- && const_ok_for_arm (INTVAL (XEXP (x, 0))))
+- {
+- *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed);
+- return true;
+- }
+-
+- subcode = GET_CODE (XEXP (x, 1));
+- if (subcode == ASHIFT || subcode == ASHIFTRT
+- || subcode == LSHIFTRT
+- || subcode == ROTATE || subcode == ROTATERT)
+- {
+- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
+- *total += rtx_cost (XEXP (XEXP (x, 1), 0), mode, subcode, 0, speed);
+- return true;
+- }
+-
+- /* A shift as a part of RSB costs no more than RSB itself. */
+- if (GET_CODE (XEXP (x, 0)) == MULT
+- && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
+- {
+- *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, code, 0, speed);
+- *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed);
+- return true;
+- }
+-
+- if (subcode == MULT
+- && power_of_two_operand (XEXP (XEXP (x, 1), 1), SImode))
+- {
+- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
+- *total += rtx_cost (XEXP (XEXP (x, 1), 0), mode, subcode, 0, speed);
+- return true;
+- }
+-
+- if (GET_RTX_CLASS (GET_CODE (XEXP (x, 1))) == RTX_COMPARE
+- || GET_RTX_CLASS (GET_CODE (XEXP (x, 1))) == RTX_COMM_COMPARE)
+- {
+- *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), mode, code,
+- 0, speed);
+- if (REG_P (XEXP (XEXP (x, 1), 0))
+- && REGNO (XEXP (XEXP (x, 1), 0)) != CC_REGNUM)
+- *total += COSTS_N_INSNS (1);
+-
+- return true;
+- }
+-
+- /* Fall through */
+-
+- case PLUS:
+- if (code == PLUS && arm_arch6 && mode == SImode
+- && (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND
+- || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND))
+- {
+- *total = COSTS_N_INSNS (1);
+- *total += rtx_cost (XEXP (XEXP (x, 0), 0), VOIDmode,
+- GET_CODE (XEXP (x, 0)), 0, speed);
+- *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed);
+- return true;
+- }
+-
+- /* MLA: All arguments must be registers. We filter out
+- multiplication by a power of two, so that we fall down into
+- the code below. */
+- if (GET_CODE (XEXP (x, 0)) == MULT
+- && !power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
+- {
+- /* The cost comes from the cost of the multiply. */
+- return false;
+- }
+-
+- if (GET_MODE_CLASS (mode) == MODE_FLOAT)
+- {
+- if (TARGET_HARD_FLOAT
+- && (mode == SFmode
+- || (mode == DFmode && !TARGET_VFP_SINGLE)))
+- {
+- *total = COSTS_N_INSNS (1);
+- if (CONST_DOUBLE_P (XEXP (x, 1))
+- && arm_const_double_rtx (XEXP (x, 1)))
+- {
+- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
+- return true;
+- }
+-
+- return false;
+- }
+-
+- *total = COSTS_N_INSNS (20);
+- return false;
+- }
+-
+- if (GET_RTX_CLASS (GET_CODE (XEXP (x, 0))) == RTX_COMPARE
+- || GET_RTX_CLASS (GET_CODE (XEXP (x, 0))) == RTX_COMM_COMPARE)
+- {
+- *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 1), mode, code,
+- 1, speed);
+- if (REG_P (XEXP (XEXP (x, 0), 0))
+- && REGNO (XEXP (XEXP (x, 0), 0)) != CC_REGNUM)
+- *total += COSTS_N_INSNS (1);
+- return true;
+- }
+-
+- /* Fall through */
+-
+- case AND: case XOR: case IOR:
+-
+- /* Normally the frame registers will be spilt into reg+const during
+- reload, so it is a bad idea to combine them with other instructions,
+- since then they might not be moved outside of loops. As a compromise
+- we allow integration with ops that have a constant as their second
+- operand. */
+- if (REG_OR_SUBREG_REG (XEXP (x, 0))
+- && ARM_FRAME_RTX (REG_OR_SUBREG_RTX (XEXP (x, 0)))
+- && !CONST_INT_P (XEXP (x, 1)))
+- *total = COSTS_N_INSNS (1);
+-
+- if (mode == DImode)
+- {
+- *total += COSTS_N_INSNS (2);
+- if (CONST_INT_P (XEXP (x, 1))
+- && const_ok_for_op (INTVAL (XEXP (x, 1)), code))
+- {
+- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
+- return true;
+- }
+-
+- return false;
+- }
+-
+- *total += COSTS_N_INSNS (1);
+- if (CONST_INT_P (XEXP (x, 1))
+- && const_ok_for_op (INTVAL (XEXP (x, 1)), code))
+- {
+- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
+- return true;
+- }
+- subcode = GET_CODE (XEXP (x, 0));
+- if (subcode == ASHIFT || subcode == ASHIFTRT
+- || subcode == LSHIFTRT
+- || subcode == ROTATE || subcode == ROTATERT)
+- {
+- *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed);
+- *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, subcode, 0, speed);
+- return true;
+- }
+-
+- if (subcode == MULT
+- && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
+- {
+- *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed);
+- *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, subcode, 0, speed);
+- return true;
+- }
+-
+- if (subcode == UMIN || subcode == UMAX
+- || subcode == SMIN || subcode == SMAX)
+- {
+- *total = COSTS_N_INSNS (3);
+- return true;
+- }
+-
+- return false;
+-
+- case MULT:
+- /* This should have been handled by the CPU specific routines. */
+- gcc_unreachable ();
+-
+- case TRUNCATE:
+- if (arm_arch3m && mode == SImode
+- && GET_CODE (XEXP (x, 0)) == LSHIFTRT
+- && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
+- && (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0))
+- == GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)))
+- && (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
+- || GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND))
+- {
+- *total = rtx_cost (XEXP (XEXP (x, 0), 0), VOIDmode, LSHIFTRT,
+- 0, speed);
+- return true;
+- }
+- *total = COSTS_N_INSNS (2); /* Plus the cost of the MULT */
+- return false;
+-
+- case NEG:
+- if (GET_MODE_CLASS (mode) == MODE_FLOAT)
+- {
+- if (TARGET_HARD_FLOAT
+- && (mode == SFmode
+- || (mode == DFmode && !TARGET_VFP_SINGLE)))
+- {
+- *total = COSTS_N_INSNS (1);
+- return false;
+- }
+- *total = COSTS_N_INSNS (2);
+- return false;
+- }
+-
+- /* Fall through */
+- case NOT:
+- *total = COSTS_N_INSNS (ARM_NUM_REGS(mode));
+- if (mode == SImode && code == NOT)
+- {
+- subcode = GET_CODE (XEXP (x, 0));
+- if (subcode == ASHIFT || subcode == ASHIFTRT
+- || subcode == LSHIFTRT
+- || subcode == ROTATE || subcode == ROTATERT
+- || (subcode == MULT
+- && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode)))
+- {
+- *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, subcode,
+- 0, speed);
+- /* Register shifts cost an extra cycle. */
+- if (!CONST_INT_P (XEXP (XEXP (x, 0), 1)))
+- *total += COSTS_N_INSNS (1) + rtx_cost (XEXP (XEXP (x, 0), 1),
+- mode, subcode,
+- 1, speed);
+- return true;
+- }
+- }
+-
+- return false;
+-
+- case IF_THEN_ELSE:
+- if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC)
+- {
+- *total = COSTS_N_INSNS (4);
+- return true;
+- }
+-
+- operand = XEXP (x, 0);
+-
+- if (!((GET_RTX_CLASS (GET_CODE (operand)) == RTX_COMPARE
+- || GET_RTX_CLASS (GET_CODE (operand)) == RTX_COMM_COMPARE)
+- && REG_P (XEXP (operand, 0))
+- && REGNO (XEXP (operand, 0)) == CC_REGNUM))
+- *total += COSTS_N_INSNS (1);
+- *total += rtx_cost (XEXP (x, 1), VOIDmode, code, 1, speed);
+- *total += rtx_cost (XEXP (x, 2), VOIDmode, code, 2, speed);
+- return true;
+-
+- case NE:
+- if (mode == SImode && XEXP (x, 1) == const0_rtx)
+- {
+- *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), mode, code,
+- 0, speed);
+- return true;
+- }
+- goto scc_insn;
+-
+- case GE:
+- if ((!REG_P (XEXP (x, 0)) || REGNO (XEXP (x, 0)) != CC_REGNUM)
+- && mode == SImode && XEXP (x, 1) == const0_rtx)
+- {
+- *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), mode, code,
+- 0, speed);
+- return true;
+- }
+- goto scc_insn;
+-
+- case LT:
+- if ((!REG_P (XEXP (x, 0)) || REGNO (XEXP (x, 0)) != CC_REGNUM)
+- && mode == SImode && XEXP (x, 1) == const0_rtx)
+- {
+- *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), mode, code,
+- 0, speed);
+- return true;
+- }
+- goto scc_insn;
+-
+- case EQ:
+- case GT:
+- case LE:
+- case GEU:
+- case LTU:
+- case GTU:
+- case LEU:
+- case UNORDERED:
+- case ORDERED:
+- case UNEQ:
+- case UNGE:
+- case UNLT:
+- case UNGT:
+- case UNLE:
+- scc_insn:
+- /* SCC insns. In the case where the comparison has already been
+- performed, then they cost 2 instructions. Otherwise they need
+- an additional comparison before them. */
+- *total = COSTS_N_INSNS (2);
+- if (REG_P (XEXP (x, 0)) && REGNO (XEXP (x, 0)) == CC_REGNUM)
+- {
+- return true;
+- }
+-
+- /* Fall through */
+- case COMPARE:
+- if (REG_P (XEXP (x, 0)) && REGNO (XEXP (x, 0)) == CC_REGNUM)
+- {
+- *total = 0;
+- return true;
+- }
+-
+- *total += COSTS_N_INSNS (1);
+- if (CONST_INT_P (XEXP (x, 1))
+- && const_ok_for_op (INTVAL (XEXP (x, 1)), code))
+- {
+- *total += rtx_cost (XEXP (x, 0), VOIDmode, code, 0, speed);
+- return true;
+- }
+-
+- subcode = GET_CODE (XEXP (x, 0));
+- if (subcode == ASHIFT || subcode == ASHIFTRT
+- || subcode == LSHIFTRT
+- || subcode == ROTATE || subcode == ROTATERT)
+- {
+- mode = GET_MODE (XEXP (x, 0));
+- *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed);
+- *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, subcode, 0, speed);
+- return true;
+- }
+-
+- if (subcode == MULT
+- && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
+- {
+- mode = GET_MODE (XEXP (x, 0));
+- *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed);
+- *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, subcode, 0, speed);
+- return true;
+- }
+-
+- return false;
+-
+- case UMIN:
+- case UMAX:
+- case SMIN:
+- case SMAX:
+- *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), mode, code, 0, speed);
+- if (!CONST_INT_P (XEXP (x, 1))
+- || !const_ok_for_arm (INTVAL (XEXP (x, 1))))
+- *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed);
+- return true;
+-
+- case ABS:
+- if (GET_MODE_CLASS (mode) == MODE_FLOAT)
+- {
+- if (TARGET_HARD_FLOAT
+- && (mode == SFmode
+- || (mode == DFmode && !TARGET_VFP_SINGLE)))
+- {
+- *total = COSTS_N_INSNS (1);
+- return false;
+- }
+- *total = COSTS_N_INSNS (20);
+- return false;
+- }
+- *total = COSTS_N_INSNS (1);
+- if (mode == DImode)
+- *total += COSTS_N_INSNS (3);
+- return false;
+-
+- case SIGN_EXTEND:
+- case ZERO_EXTEND:
+- *total = 0;
+- if (GET_MODE_CLASS (mode) == MODE_INT)
+- {
+- rtx op = XEXP (x, 0);
+- machine_mode opmode = GET_MODE (op);
+-
+- if (mode == DImode)
+- *total += COSTS_N_INSNS (1);
+-
+- if (opmode != SImode)
+- {
+- if (MEM_P (op))
+- {
+- /* If !arm_arch4, we use one of the extendhisi2_mem
+- or movhi_bytes patterns for HImode. For a QImode
+- sign extension, we first zero-extend from memory
+- and then perform a shift sequence. */
+- if (!arm_arch4 && (opmode != QImode || code == SIGN_EXTEND))
+- *total += COSTS_N_INSNS (2);
+- }
+- else if (arm_arch6)
+- *total += COSTS_N_INSNS (1);
+-
+- /* We don't have the necessary insn, so we need to perform some
+- other operation. */
+- else if (TARGET_ARM && code == ZERO_EXTEND && mode == QImode)
+- /* An and with constant 255. */
+- *total += COSTS_N_INSNS (1);
+- else
+- /* A shift sequence. Increase costs slightly to avoid
+- combining two shifts into an extend operation. */
+- *total += COSTS_N_INSNS (2) + 1;
+- }
+-
+- return false;
+- }
+-
+- switch (GET_MODE (XEXP (x, 0)))
+- {
+- case V8QImode:
+- case V4HImode:
+- case V2SImode:
+- case V4QImode:
+- case V2HImode:
+- *total = COSTS_N_INSNS (1);
+- return false;
+-
+- default:
+- gcc_unreachable ();
+- }
+- gcc_unreachable ();
+-
+- case ZERO_EXTRACT:
+- case SIGN_EXTRACT:
+- mode = GET_MODE (XEXP (x, 0));
+- *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), mode, code, 0, speed);
+- return true;
+-
+- case CONST_INT:
+- if (const_ok_for_arm (INTVAL (x))
+- || const_ok_for_arm (~INTVAL (x)))
+- *total = COSTS_N_INSNS (1);
+- else
+- *total = COSTS_N_INSNS (arm_gen_constant (SET, mode, NULL_RTX,
+- INTVAL (x), NULL_RTX,
+- NULL_RTX, 0, 0));
+- return true;
+-
+- case CONST:
+- case LABEL_REF:
+- case SYMBOL_REF:
+- *total = COSTS_N_INSNS (3);
+- return true;
+-
+- case HIGH:
+- *total = COSTS_N_INSNS (1);
+- return true;
+-
+- case LO_SUM:
+- *total = COSTS_N_INSNS (1);
+- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
+- return true;
+-
+- case CONST_DOUBLE:
+- if (TARGET_HARD_FLOAT && vfp3_const_double_rtx (x)
+- && (mode == SFmode || !TARGET_VFP_SINGLE))
+- *total = COSTS_N_INSNS (1);
+- else
+- *total = COSTS_N_INSNS (4);
+- return true;
+-
+- case SET:
+- /* The vec_extract patterns accept memory operands that require an
+- address reload. Account for the cost of that reload to give the
+- auto-inc-dec pass an incentive to try to replace them. */
+- if (TARGET_NEON && MEM_P (SET_DEST (x))
+- && GET_CODE (SET_SRC (x)) == VEC_SELECT)
+- {
+- mode = GET_MODE (SET_DEST (x));
+- *total = rtx_cost (SET_DEST (x), mode, code, 0, speed);
+- if (!neon_vector_mem_operand (SET_DEST (x), 2, true))
+- *total += COSTS_N_INSNS (1);
+- return true;
+- }
+- /* Likewise for the vec_set patterns. */
+- if (TARGET_NEON && GET_CODE (SET_SRC (x)) == VEC_MERGE
+- && GET_CODE (XEXP (SET_SRC (x), 0)) == VEC_DUPLICATE
+- && MEM_P (XEXP (XEXP (SET_SRC (x), 0), 0)))
+- {
+- rtx mem = XEXP (XEXP (SET_SRC (x), 0), 0);
+- mode = GET_MODE (SET_DEST (x));
+- *total = rtx_cost (mem, mode, code, 0, speed);
+- if (!neon_vector_mem_operand (mem, 2, true))
+- *total += COSTS_N_INSNS (1);
+- return true;
+- }
+- return false;
+-
+- case UNSPEC:
+- /* We cost this as high as our memory costs to allow this to
+- be hoisted from loops. */
+- if (XINT (x, 1) == UNSPEC_PIC_UNIFIED)
+- {
+- *total = COSTS_N_INSNS (2 + ARM_NUM_REGS (mode));
+- }
+- return true;
+-
+- case CONST_VECTOR:
+- if (TARGET_NEON
+- && TARGET_HARD_FLOAT
+- && outer == SET
+- && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode))
+- && neon_immediate_valid_for_move (x, mode, NULL, NULL))
+- *total = COSTS_N_INSNS (1);
+- else
+- *total = COSTS_N_INSNS (4);
+- return true;
+-
+- default:
+- *total = COSTS_N_INSNS (4);
+- return false;
+- }
+-}
+-
+-/* Estimates the size cost of thumb1 instructions.
+- For now most of the code is copied from thumb1_rtx_costs. We need more
+- fine grain tuning when we have more related test cases. */
+-static inline int
+-thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
+-{
+- machine_mode mode = GET_MODE (x);
+- int words;
+-
+- switch (code)
+- {
+- case ASHIFT:
+- case ASHIFTRT:
+- case LSHIFTRT:
+- case ROTATERT:
+- return (mode == SImode) ? COSTS_N_INSNS (1) : COSTS_N_INSNS (2);
+-
+- case PLUS:
+- case MINUS:
+- /* Thumb-1 needs two instructions to fulfill shiftadd/shiftsub0/shiftsub1
+- defined by RTL expansion, especially for the expansion of
+- multiplication. */
+- if ((GET_CODE (XEXP (x, 0)) == MULT
+- && power_of_two_operand (XEXP (XEXP (x,0),1), SImode))
+- || (GET_CODE (XEXP (x, 1)) == MULT
+- && power_of_two_operand (XEXP (XEXP (x, 1), 1), SImode)))
+- return COSTS_N_INSNS (2);
+- /* On purpose fall through for normal RTX. */
+- case COMPARE:
+- case NEG:
+- case NOT:
+- return COSTS_N_INSNS (1);
+-
+- case MULT:
+- if (CONST_INT_P (XEXP (x, 1)))
+- {
+- /* Thumb1 mul instruction can't operate on const. We must Load it
+- into a register first. */
+- int const_size = thumb1_size_rtx_costs (XEXP (x, 1), CONST_INT, SET);
+- /* For the targets which have a very small and high-latency multiply
+- unit, we prefer to synthesize the mult with up to 5 instructions,
+- giving a good balance between size and performance. */
+- if (arm_arch6m && arm_m_profile_small_mul)
+- return COSTS_N_INSNS (5);
+- else
+- return COSTS_N_INSNS (1) + const_size;
+- }
+- return COSTS_N_INSNS (1);
+-
+- case SET:
+- /* A SET doesn't have a mode, so let's look at the SET_DEST to get
+- the mode. */
+- words = ARM_NUM_INTS (GET_MODE_SIZE (GET_MODE (SET_DEST (x))));
+- return COSTS_N_INSNS (words)
+- + COSTS_N_INSNS (1) * (satisfies_constraint_J (SET_SRC (x))
+- || satisfies_constraint_K (SET_SRC (x))
+- /* thumb1_movdi_insn. */
+- || ((words > 1) && MEM_P (SET_SRC (x))));
+-
+- case CONST_INT:
+- if (outer == SET)
+- {
+- if ((unsigned HOST_WIDE_INT) INTVAL (x) < 256)
+- return COSTS_N_INSNS (1);
+- /* See split "TARGET_THUMB1 && satisfies_constraint_J". */
+- if (INTVAL (x) >= -255 && INTVAL (x) <= -1)
+- return COSTS_N_INSNS (2);
+- /* See split "TARGET_THUMB1 && satisfies_constraint_K". */
+- if (thumb_shiftable_const (INTVAL (x)))
+- return COSTS_N_INSNS (2);
+- return COSTS_N_INSNS (3);
+- }
+- else if ((outer == PLUS || outer == COMPARE)
+- && INTVAL (x) < 256 && INTVAL (x) > -256)
+- return 0;
+- else if ((outer == IOR || outer == XOR || outer == AND)
+- && INTVAL (x) < 256 && INTVAL (x) >= -256)
+- return COSTS_N_INSNS (1);
+- else if (outer == AND)
+- {
+- int i;
+- /* This duplicates the tests in the andsi3 expander. */
+- for (i = 9; i <= 31; i++)
+- if ((((HOST_WIDE_INT) 1) << i) - 1 == INTVAL (x)
+- || (((HOST_WIDE_INT) 1) << i) - 1 == ~INTVAL (x))
+- return COSTS_N_INSNS (2);
+- }
+- else if (outer == ASHIFT || outer == ASHIFTRT
+- || outer == LSHIFTRT)
+- return 0;
+- return COSTS_N_INSNS (2);
+-
+- case CONST:
+- case CONST_DOUBLE:
+- case LABEL_REF:
+- case SYMBOL_REF:
+- return COSTS_N_INSNS (3);
+-
+- case UDIV:
+- case UMOD:
+- case DIV:
+- case MOD:
+- return 100;
+-
+- case TRUNCATE:
+- return 99;
+-
+- case AND:
+- case XOR:
+- case IOR:
+- return COSTS_N_INSNS (1);
+-
+- case MEM:
+- return (COSTS_N_INSNS (1)
+- + COSTS_N_INSNS (1)
+- * ((GET_MODE_SIZE (mode) - 1) / UNITS_PER_WORD)
+- + ((GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
+- ? COSTS_N_INSNS (1) : 0));
+-
+- case IF_THEN_ELSE:
+- /* XXX a guess. */
+- if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC)
+- return 14;
+- return 2;
+-
+- case ZERO_EXTEND:
+- /* XXX still guessing. */
+- switch (GET_MODE (XEXP (x, 0)))
+- {
+- case QImode:
+- return (1 + (mode == DImode ? 4 : 0)
+- + (MEM_P (XEXP (x, 0)) ? 10 : 0));
+-
+- case HImode:
+- return (4 + (mode == DImode ? 4 : 0)
+- + (MEM_P (XEXP (x, 0)) ? 10 : 0));
+-
+- case SImode:
+- return (1 + (MEM_P (XEXP (x, 0)) ? 10 : 0));
+-
+- default:
+- return 99;
+- }
+-
+- default:
+- return 99;
+- }
+-}
+-
+-/* RTX costs when optimizing for size. */
+-static bool
+-arm_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
+- int *total)
+-{
+- machine_mode mode = GET_MODE (x);
+- if (TARGET_THUMB1)
+- {
+- *total = thumb1_size_rtx_costs (x, code, outer_code);
+- return true;
+- }
+-
+- /* FIXME: This makes no attempt to prefer narrow Thumb-2 instructions. */
+- switch (code)
+- {
+- case MEM:
+- /* A memory access costs 1 insn if the mode is small, or the address is
+- a single register, otherwise it costs one insn per word. */
+- if (REG_P (XEXP (x, 0)))
+- *total = COSTS_N_INSNS (1);
+- else if (flag_pic
+- && GET_CODE (XEXP (x, 0)) == PLUS
+- && will_be_in_index_register (XEXP (XEXP (x, 0), 1)))
+- /* This will be split into two instructions.
+- See arm.md:calculate_pic_address. */
+- *total = COSTS_N_INSNS (2);
+- else
+- *total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
+- return true;
+-
+- case DIV:
+- case MOD:
+- case UDIV:
+- case UMOD:
+- /* Needs a libcall, so it costs about this. */
+- *total = COSTS_N_INSNS (2);
+- return false;
+-
+- case ROTATE:
+- if (mode == SImode && REG_P (XEXP (x, 1)))
+- {
+- *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), mode, code,
+- 0, false);
+- return true;
+- }
+- /* Fall through */
+- case ROTATERT:
+- case ASHIFT:
+- case LSHIFTRT:
+- case ASHIFTRT:
+- if (mode == DImode && CONST_INT_P (XEXP (x, 1)))
+- {
+- *total = COSTS_N_INSNS (3) + rtx_cost (XEXP (x, 0), mode, code,
+- 0, false);
+- return true;
+- }
+- else if (mode == SImode)
+- {
+- *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), mode, code,
+- 0, false);
+- /* Slightly disparage register shifts, but not by much. */
+- if (!CONST_INT_P (XEXP (x, 1)))
+- *total += 1 + rtx_cost (XEXP (x, 1), mode, code, 1, false);
+- return true;
+- }
+-
+- /* Needs a libcall. */
+- *total = COSTS_N_INSNS (2);
+- return false;
+-
+- case MINUS:
+- if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
+- && (mode == SFmode || !TARGET_VFP_SINGLE))
+- {
+- *total = COSTS_N_INSNS (1);
+- return false;
+- }
+-
+- if (mode == SImode)
+- {
+- enum rtx_code subcode0 = GET_CODE (XEXP (x, 0));
+- enum rtx_code subcode1 = GET_CODE (XEXP (x, 1));
+-
+- if (subcode0 == ROTATE || subcode0 == ROTATERT || subcode0 == ASHIFT
+- || subcode0 == LSHIFTRT || subcode0 == ASHIFTRT
+- || subcode1 == ROTATE || subcode1 == ROTATERT
+- || subcode1 == ASHIFT || subcode1 == LSHIFTRT
+- || subcode1 == ASHIFTRT)
+- {
+- /* It's just the cost of the two operands. */
+- *total = 0;
+- return false;
+- }
+-
+- *total = COSTS_N_INSNS (1);
+- return false;
+- }
+-
+- *total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
+- return false;
+-
+- case PLUS:
+- if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
+- && (mode == SFmode || !TARGET_VFP_SINGLE))
+- {
+- *total = COSTS_N_INSNS (1);
+- return false;
+- }
+-
+- /* A shift as a part of ADD costs nothing. */
+- if (GET_CODE (XEXP (x, 0)) == MULT
+- && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
+- {
+- *total = COSTS_N_INSNS (TARGET_THUMB2 ? 2 : 1);
+- *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, code, 0, false);
+- *total += rtx_cost (XEXP (x, 1), mode, code, 1, false);
+- return true;
+- }
+-
+- /* Fall through */
+- case AND: case XOR: case IOR:
+- if (mode == SImode)
+- {
+- enum rtx_code subcode = GET_CODE (XEXP (x, 0));
+-
+- if (subcode == ROTATE || subcode == ROTATERT || subcode == ASHIFT
+- || subcode == LSHIFTRT || subcode == ASHIFTRT
+- || (code == AND && subcode == NOT))
+- {
+- /* It's just the cost of the two operands. */
+- *total = 0;
+- return false;
+- }
+- }
+-
+- *total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
+- return false;
+-
+- case MULT:
+- *total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
+- return false;
+-
+- case NEG:
+- if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
+- && (mode == SFmode || !TARGET_VFP_SINGLE))
+- {
+- *total = COSTS_N_INSNS (1);
+- return false;
+- }
+-
+- /* Fall through */
+- case NOT:
+- *total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
+-
+- return false;
+-
+- case IF_THEN_ELSE:
+- *total = 0;
+- return false;
+-
+- case COMPARE:
+- if (cc_register (XEXP (x, 0), VOIDmode))
+- * total = 0;
+- else
+- *total = COSTS_N_INSNS (1);
+- return false;
++ case UDIV:
++ case UMOD:
++ case DIV:
++ case MOD:
++ return 100;
+
+- case ABS:
+- if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
+- && (mode == SFmode || !TARGET_VFP_SINGLE))
+- *total = COSTS_N_INSNS (1);
+- else
+- *total = COSTS_N_INSNS (1 + ARM_NUM_REGS (mode));
+- return false;
++ case TRUNCATE:
++ return 99;
+
+- case SIGN_EXTEND:
+- case ZERO_EXTEND:
+- return arm_rtx_costs_1 (x, outer_code, total, 0);
++ case AND:
++ case XOR:
++ case IOR:
++ return COSTS_N_INSNS (1);
+
+- case CONST_INT:
+- if (const_ok_for_arm (INTVAL (x)))
+- /* A multiplication by a constant requires another instruction
+- to load the constant to a register. */
+- *total = COSTS_N_INSNS ((outer_code == SET || outer_code == MULT)
+- ? 1 : 0);
+- else if (const_ok_for_arm (~INTVAL (x)))
+- *total = COSTS_N_INSNS (outer_code == AND ? 0 : 1);
+- else if (const_ok_for_arm (-INTVAL (x)))
+- {
+- if (outer_code == COMPARE || outer_code == PLUS
+- || outer_code == MINUS)
+- *total = 0;
+- else
+- *total = COSTS_N_INSNS (1);
+- }
+- else
+- *total = COSTS_N_INSNS (2);
+- return true;
++ case MEM:
++ return (COSTS_N_INSNS (1)
++ + COSTS_N_INSNS (1)
++ * ((GET_MODE_SIZE (mode) - 1) / UNITS_PER_WORD)
++ + ((GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
++ ? COSTS_N_INSNS (1) : 0));
+
+- case CONST:
+- case LABEL_REF:
+- case SYMBOL_REF:
+- *total = COSTS_N_INSNS (2);
+- return true;
++ case IF_THEN_ELSE:
++ /* XXX a guess. */
++ if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC)
++ return 14;
++ return 2;
+
+- case CONST_DOUBLE:
+- *total = COSTS_N_INSNS (4);
+- return true;
++ case ZERO_EXTEND:
++ /* XXX still guessing. */
++ switch (GET_MODE (XEXP (x, 0)))
++ {
++ case QImode:
++ return (1 + (mode == DImode ? 4 : 0)
++ + (MEM_P (XEXP (x, 0)) ? 10 : 0));
+
+- case CONST_VECTOR:
+- if (TARGET_NEON
+- && TARGET_HARD_FLOAT
+- && outer_code == SET
+- && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode))
+- && neon_immediate_valid_for_move (x, mode, NULL, NULL))
+- *total = COSTS_N_INSNS (1);
+- else
+- *total = COSTS_N_INSNS (4);
+- return true;
++ case HImode:
++ return (4 + (mode == DImode ? 4 : 0)
++ + (MEM_P (XEXP (x, 0)) ? 10 : 0));
+
+- case HIGH:
+- case LO_SUM:
+- /* We prefer constant pool entries to MOVW/MOVT pairs, so bump the
+- cost of these slightly. */
+- *total = COSTS_N_INSNS (1) + 1;
+- return true;
++ case SImode:
++ return (1 + (MEM_P (XEXP (x, 0)) ? 10 : 0));
+
+- case SET:
+- return false;
++ default:
++ return 99;
++ }
+
+ default:
+- if (mode != VOIDmode)
+- *total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
+- else
+- *total = COSTS_N_INSNS (4); /* How knows? */
+- return false;
++ return 99;
+ }
+ }
+
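
[Annotation, not part of the patch] The new size-cost ladder for constants above (one instruction for an 8-bit immediate, two for movw/negated/shifted forms, three otherwise) is compact enough to restate outside GCC. A minimal sketch, assuming the constant feeds a SET; have_movt stands in for TARGET_HAVE_MOVT and shiftable_const is an illustrative stand-in for thumb_shiftable_const:

#include <stdint.h>

/* Stand-in for thumb_shiftable_const: an 8-bit value shifted left. */
static int shiftable_const (uint32_t v)
{
  for (int s = 1; s <= 24; s++)
    if ((v & ~((uint32_t) 0xff << s)) == 0 && (v >> s) != 0)
      return 1;
  return 0;
}

/* Instruction count for loading constant V, mirroring the CONST_INT
   case of thumb1_size_rtx_costs when the outer code is SET.  */
int thumb1_set_const_size (int32_t v, int have_movt)
{
  if ((uint32_t) v < 256)
    return 1;                            /* movs rN, #imm8 */
  if (have_movt && !(v & 0xffff0000))
    return 2;                            /* movw is 4 bytes long */
  if (v >= -255 && v <= -1)
    return 2;                            /* movs + negs (constraint J) */
  if (shiftable_const ((uint32_t) v))
    return 2;                            /* movs + lsls (constraint K) */
  return 3;                              /* literal-pool load */
}
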
+@@ -9519,7 +8950,7 @@ arm_unspec_cost (rtx x, enum rtx_code /* outer_code */, bool speed_p, int *cost)
+ flags are live or not, and thus no realistic way to determine what
+ the size will eventually be. */
+ static bool
+-arm_new_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
++arm_rtx_costs_internal (rtx x, enum rtx_code code, enum rtx_code outer_code,
+ const struct cpu_cost_table *extra_cost,
+ int *cost, bool speed_p)
+ {
+@@ -10771,8 +10202,6 @@ arm_new_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
if ((arm_arch4 || GET_MODE (XEXP (x, 0)) == SImode)
&& MEM_P (XEXP (x, 0)))
{
@@ -53280,7 +54647,443 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if (mode == DImode)
*cost += COSTS_N_INSNS (1);
-@@ -12257,7 +12571,7 @@ vfp3_const_double_index (rtx x)
+@@ -11164,390 +10593,70 @@ arm_new_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
+ /* Vector costs? */
+ }
+ *cost = LIBCALL_COST (1);
+- return false;
+-
+- case FLOAT:
+- case UNSIGNED_FLOAT:
+- if (TARGET_HARD_FLOAT)
+- {
+- /* ??? Increase the cost to deal with transferring from CORE
+- -> FP registers? */
+- if (speed_p)
+- *cost += extra_cost->fp[mode == DFmode].fromint;
+- return false;
+- }
+- *cost = LIBCALL_COST (1);
+- return false;
+-
+- case CALL:
+- return true;
+-
+- case ASM_OPERANDS:
+- {
+- /* Just a guess. Guess number of instructions in the asm
+- plus one insn per input. Always a minimum of COSTS_N_INSNS (1)
+- though (see PR60663). */
+- int asm_length = MAX (1, asm_str_count (ASM_OPERANDS_TEMPLATE (x)));
+- int num_operands = ASM_OPERANDS_INPUT_LENGTH (x);
+-
+- *cost = COSTS_N_INSNS (asm_length + num_operands);
+- return true;
+- }
+- default:
+- if (mode != VOIDmode)
+- *cost = COSTS_N_INSNS (ARM_NUM_REGS (mode));
+- else
+- *cost = COSTS_N_INSNS (4); /* Who knows? */
+- return false;
+- }
+-}
+-
+-#undef HANDLE_NARROW_SHIFT_ARITH
+-
+-/* RTX costs when optimizing for size. */
+-static bool
+-arm_rtx_costs (rtx x, machine_mode mode ATTRIBUTE_UNUSED, int outer_code,
+- int opno ATTRIBUTE_UNUSED, int *total, bool speed)
+-{
+- bool result;
+- int code = GET_CODE (x);
+-
+- if (TARGET_OLD_RTX_COSTS
+- || (!current_tune->insn_extra_cost && !TARGET_NEW_GENERIC_COSTS))
+- {
+- /* Old way. (Deprecated.) */
+- if (!speed)
+- result = arm_size_rtx_costs (x, (enum rtx_code) code,
+- (enum rtx_code) outer_code, total);
+- else
+- result = current_tune->rtx_costs (x, (enum rtx_code) code,
+- (enum rtx_code) outer_code, total,
+- speed);
+- }
+- else
+- {
+- /* New way. */
+- if (current_tune->insn_extra_cost)
+- result = arm_new_rtx_costs (x, (enum rtx_code) code,
+- (enum rtx_code) outer_code,
+- current_tune->insn_extra_cost,
+- total, speed);
+- /* TARGET_NEW_GENERIC_COSTS && !TARGET_OLD_RTX_COSTS
+- && current_tune->insn_extra_cost != NULL */
+- else
+- result = arm_new_rtx_costs (x, (enum rtx_code) code,
+- (enum rtx_code) outer_code,
+- &generic_extra_costs, total, speed);
+- }
+-
+- if (dump_file && (dump_flags & TDF_DETAILS))
+- {
+- print_rtl_single (dump_file, x);
+- fprintf (dump_file, "\n%s cost: %d (%s)\n", speed ? "Hot" : "Cold",
+- *total, result ? "final" : "partial");
+- }
+- return result;
+-}
+-
+-/* RTX costs for cores with a slow MUL implementation. Thumb-2 is not
+- supported on any "slowmul" cores, so it can be ignored. */
+-
+-static bool
+-arm_slowmul_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
+- int *total, bool speed)
+-{
+- machine_mode mode = GET_MODE (x);
+-
+- if (TARGET_THUMB)
+- {
+- *total = thumb1_rtx_costs (x, code, outer_code);
+- return true;
+- }
+-
+- switch (code)
+- {
+- case MULT:
+- if (GET_MODE_CLASS (mode) == MODE_FLOAT
+- || mode == DImode)
+- {
+- *total = COSTS_N_INSNS (20);
+- return false;
+- }
+-
+- if (CONST_INT_P (XEXP (x, 1)))
+- {
+- unsigned HOST_WIDE_INT i = (INTVAL (XEXP (x, 1))
+- & (unsigned HOST_WIDE_INT) 0xffffffff);
+- int cost, const_ok = const_ok_for_arm (i);
+- int j, booth_unit_size;
+-
+- /* Tune as appropriate. */
+- cost = const_ok ? 4 : 8;
+- booth_unit_size = 2;
+- for (j = 0; i && j < 32; j += booth_unit_size)
+- {
+- i >>= booth_unit_size;
+- cost++;
+- }
+-
+- *total = COSTS_N_INSNS (cost);
+- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
+- return true;
+- }
+-
+- *total = COSTS_N_INSNS (20);
+- return false;
+-
+- default:
+- return arm_rtx_costs_1 (x, outer_code, total, speed);;
+- }
+-}
+-
+-
+-/* RTX cost for cores with a fast multiply unit (M variants). */
+-
+-static bool
+-arm_fastmul_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
+- int *total, bool speed)
+-{
+- machine_mode mode = GET_MODE (x);
+-
+- if (TARGET_THUMB1)
+- {
+- *total = thumb1_rtx_costs (x, code, outer_code);
+- return true;
+- }
+-
+- /* ??? should thumb2 use different costs? */
+- switch (code)
+- {
+- case MULT:
+- /* There is no point basing this on the tuning, since it is always the
+- fast variant if it exists at all. */
+- if (mode == DImode
+- && (GET_CODE (XEXP (x, 0)) == GET_CODE (XEXP (x, 1)))
+- && (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND
+- || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND))
+- {
+- *total = COSTS_N_INSNS(2);
+- return false;
+- }
+-
+-
+- if (mode == DImode)
+- {
+- *total = COSTS_N_INSNS (5);
+- return false;
+- }
+-
+- if (CONST_INT_P (XEXP (x, 1)))
+- {
+- unsigned HOST_WIDE_INT i = (INTVAL (XEXP (x, 1))
+- & (unsigned HOST_WIDE_INT) 0xffffffff);
+- int cost, const_ok = const_ok_for_arm (i);
+- int j, booth_unit_size;
+-
+- /* Tune as appropriate. */
+- cost = const_ok ? 4 : 8;
+- booth_unit_size = 8;
+- for (j = 0; i && j < 32; j += booth_unit_size)
+- {
+- i >>= booth_unit_size;
+- cost++;
+- }
+-
+- *total = COSTS_N_INSNS(cost);
+- return false;
+- }
+-
+- if (mode == SImode)
+- {
+- *total = COSTS_N_INSNS (4);
+- return false;
+- }
+-
+- if (GET_MODE_CLASS (mode) == MODE_FLOAT)
+- {
+- if (TARGET_HARD_FLOAT
+- && (mode == SFmode
+- || (mode == DFmode && !TARGET_VFP_SINGLE)))
+- {
+- *total = COSTS_N_INSNS (1);
+- return false;
+- }
+- }
+-
+- /* Requires a lib call */
+- *total = COSTS_N_INSNS (20);
+- return false;
+-
+- default:
+- return arm_rtx_costs_1 (x, outer_code, total, speed);
+- }
+-}
+-
+-
+-/* RTX cost for XScale CPUs. Thumb-2 is not supported on any xscale cores,
+- so it can be ignored. */
+-
+-static bool
+-arm_xscale_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
+- int *total, bool speed)
+-{
+- machine_mode mode = GET_MODE (x);
+-
+- if (TARGET_THUMB)
+- {
+- *total = thumb1_rtx_costs (x, code, outer_code);
+- return true;
+- }
+-
+- switch (code)
+- {
+- case COMPARE:
+- if (GET_CODE (XEXP (x, 0)) != MULT)
+- return arm_rtx_costs_1 (x, outer_code, total, speed);
+-
+- /* A COMPARE of a MULT is slow on XScale; the muls instruction
+- will stall until the multiplication is complete. */
+- *total = COSTS_N_INSNS (3);
+- return false;
+-
+- case MULT:
+- /* There is no point basing this on the tuning, since it is always the
+- fast variant if it exists at all. */
+- if (mode == DImode
+- && (GET_CODE (XEXP (x, 0)) == GET_CODE (XEXP (x, 1)))
+- && (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND
+- || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND))
+- {
+- *total = COSTS_N_INSNS (2);
+- return false;
+- }
+-
+-
+- if (mode == DImode)
+- {
+- *total = COSTS_N_INSNS (5);
+- return false;
+- }
+-
+- if (CONST_INT_P (XEXP (x, 1)))
+- {
+- /* If operand 1 is a constant we can more accurately
+- calculate the cost of the multiply. The multiplier can
+- retire 15 bits on the first cycle and a further 12 on the
+- second. We do, of course, have to load the constant into
+- a register first. */
+- unsigned HOST_WIDE_INT i = INTVAL (XEXP (x, 1));
+- /* There's a general overhead of one cycle. */
+- int cost = 1;
+- unsigned HOST_WIDE_INT masked_const;
+-
+- if (i & 0x80000000)
+- i = ~i;
+-
+- i &= (unsigned HOST_WIDE_INT) 0xffffffff;
+-
+- masked_const = i & 0xffff8000;
+- if (masked_const != 0)
+- {
+- cost++;
+- masked_const = i & 0xf8000000;
+- if (masked_const != 0)
+- cost++;
+- }
+- *total = COSTS_N_INSNS (cost);
+- return false;
+- }
++ return false;
+
+- if (mode == SImode)
++ case FLOAT:
++ case UNSIGNED_FLOAT:
++ if (TARGET_HARD_FLOAT)
+ {
+- *total = COSTS_N_INSNS (3);
++ /* ??? Increase the cost to deal with transferring from CORE
++ -> FP registers? */
++ if (speed_p)
++ *cost += extra_cost->fp[mode == DFmode].fromint;
+ return false;
+ }
+-
+- /* Requires a lib call */
+- *total = COSTS_N_INSNS (20);
++ *cost = LIBCALL_COST (1);
+ return false;
+
++ case CALL:
++ return true;
++
++ case ASM_OPERANDS:
++ {
++ /* Just a guess. Guess number of instructions in the asm
++ plus one insn per input. Always a minimum of COSTS_N_INSNS (1)
++ though (see PR60663). */
++ int asm_length = MAX (1, asm_str_count (ASM_OPERANDS_TEMPLATE (x)));
++ int num_operands = ASM_OPERANDS_INPUT_LENGTH (x);
++
++ *cost = COSTS_N_INSNS (asm_length + num_operands);
++ return true;
++ }
+ default:
+- return arm_rtx_costs_1 (x, outer_code, total, speed);
++ if (mode != VOIDmode)
++ *cost = COSTS_N_INSNS (ARM_NUM_REGS (mode));
++ else
++ *cost = COSTS_N_INSNS (4); /* Who knows? */
++ return false;
+ }
+ }
+
++#undef HANDLE_NARROW_SHIFT_ARITH
+
+-/* RTX costs for 9e (and later) cores. */
++/* RTX costs entry point. */
+
+ static bool
+-arm_9e_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
+- int *total, bool speed)
++arm_rtx_costs (rtx x, machine_mode mode ATTRIBUTE_UNUSED, int outer_code,
++ int opno ATTRIBUTE_UNUSED, int *total, bool speed)
+ {
+- machine_mode mode = GET_MODE (x);
+-
+- if (TARGET_THUMB1)
+- {
+- switch (code)
+- {
+- case MULT:
+- /* Small multiply: 32 cycles for an integer multiply inst. */
+- if (arm_arch6m && arm_m_profile_small_mul)
+- *total = COSTS_N_INSNS (32);
+- else
+- *total = COSTS_N_INSNS (3);
+- return true;
++ bool result;
++ int code = GET_CODE (x);
++ gcc_assert (current_tune->insn_extra_cost);
+
+- default:
+- *total = thumb1_rtx_costs (x, code, outer_code);
+- return true;
+- }
+- }
++ result = arm_rtx_costs_internal (x, (enum rtx_code) code,
++ (enum rtx_code) outer_code,
++ current_tune->insn_extra_cost,
++ total, speed);
+
+- switch (code)
++ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+- case MULT:
+- /* There is no point basing this on the tuning, since it is always the
+- fast variant if it exists at all. */
+- if (mode == DImode
+- && (GET_CODE (XEXP (x, 0)) == GET_CODE (XEXP (x, 1)))
+- && (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND
+- || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND))
+- {
+- *total = COSTS_N_INSNS (2);
+- return false;
+- }
+-
+-
+- if (mode == DImode)
+- {
+- *total = COSTS_N_INSNS (5);
+- return false;
+- }
+-
+- if (mode == SImode)
+- {
+- *total = COSTS_N_INSNS (2);
+- return false;
+- }
+-
+- if (GET_MODE_CLASS (mode) == MODE_FLOAT)
+- {
+- if (TARGET_HARD_FLOAT
+- && (mode == SFmode
+- || (mode == DFmode && !TARGET_VFP_SINGLE)))
+- {
+- *total = COSTS_N_INSNS (1);
+- return false;
+- }
+- }
+-
+- *total = COSTS_N_INSNS (20);
+- return false;
+-
+- default:
+- return arm_rtx_costs_1 (x, outer_code, total, speed);
++ print_rtl_single (dump_file, x);
++ fprintf (dump_file, "\n%s cost: %d (%s)\n", speed ? "Hot" : "Cold",
++ *total, result ? "final" : "partial");
+ }
++ return result;
+ }
++
+ /* All address computations that can be done are free, but rtx cost returns
+ the same for practically all of them. So we weight the different types
+ of address here in the order (most pref first):
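
[Annotation, not part of the patch] The rewritten entry point earlier in this hunk now insists on a per-core cost table (gcc_assert (current_tune->insn_extra_cost)) and funnels every query through arm_rtx_costs_internal, the old TARGET_OLD_RTX_COSTS fallbacks being deleted. Its control flow reduces to the following shape; a self-contained sketch with stand-in types (cost_table, compute_cost), not GCC code:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct cost_table { int mul_cost; int mem_cost; };   /* stand-in */

static bool compute_cost (int op, const struct cost_table *t, int *total)
{
  *total = op ? t->mem_cost : t->mul_cost;   /* table-driven lookup */
  return true;                               /* true: cost is final */
}

bool rtx_costs_entry (int op, const struct cost_table *tune,
                      int *total, bool speed, FILE *dump)
{
  assert (tune != NULL);                     /* no legacy path left */
  bool final = compute_cost (op, tune, total);
  if (dump != NULL)
    fprintf (dump, "%s cost: %d (%s)\n", speed ? "Hot" : "Cold",
             *total, final ? "final" : "partial");
  return final;
}
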
+@@ -12269,7 +11378,7 @@ vfp3_const_double_index (rtx x)
/* We can permit four significant bits of mantissa only, plus a high bit
which is always 1. */
@@ -53289,7 +55092,20 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if ((mantissa & mask) != 0)
return -1;
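
[Annotation, not part of the patch] The mask test above is the core of the fconst immediate check: of the 52 explicit mantissa bits of a double, only the top four may be set (the fifth significant bit is the implicit leading 1). Restated stand-alone, assuming IEEE-754 binary64 layout and ignoring the sign/exponent range checks the full function performs:

#include <stdint.h>
#include <string.h>

int vfp3_mantissa_ok (double d)
{
  uint64_t bits;
  memcpy (&bits, &d, sizeof bits);               /* well-defined pun */
  uint64_t mantissa = bits & ((UINT64_C (1) << 52) - 1);
  uint64_t low_mask = (UINT64_C (1) << 48) - 1;  /* bits below the top 4 */
  return (mantissa & low_mask) == 0;
}
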
-@@ -13139,7 +13453,7 @@ coproc_secondary_reload_class (machine_mode mode, rtx x, bool wb)
+@@ -12423,6 +11532,12 @@ neon_valid_immediate (rtx op, machine_mode mode, int inverse,
+ return 18;
+ }
+
++ /* The tricks done in the code below apply for little-endian vector layout.
++ For big-endian vectors only allow vectors of the form { a, a, a..., a }.
++ FIXME: Implement logic for big-endian vectors. */
++ if (BYTES_BIG_ENDIAN && vector && !const_vec_duplicate_p (op))
++ return -1;
++
+ /* Splat vector constant out into a byte vector. */
+ for (i = 0; i < n_elts; i++)
+ {
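
[Annotation, not part of the patch] const_vec_duplicate_p, used by the new big-endian guard above, accepts only vectors whose lanes are all identical, i.e. { a, a, ..., a }. Reduced to plain C over a flattened lane array (an illustrative stand-in for the RTL accessors):

int is_vec_duplicate (const int *lanes, int n)
{
  for (int i = 1; i < n; i++)
    if (lanes[i] != lanes[0])
      return 0;       /* differing lanes: not a duplicate */
  return n > 0;
}
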
+@@ -13151,7 +12266,7 @@ coproc_secondary_reload_class (machine_mode mode, rtx x, bool wb)
{
if (mode == HFmode)
{
@@ -53298,7 +55114,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
return GENERAL_REGS;
if (s_register_operand (x, mode) || neon_vector_mem_operand (x, 2, true))
return NO_REGS;
-@@ -15976,14 +16290,17 @@ gen_operands_ldrd_strd (rtx *operands, bool load,
+@@ -15988,14 +15103,17 @@ gen_operands_ldrd_strd (rtx *operands, bool load,
/* If the same input register is used in both stores
when storing different constants, try to find a free register.
For example, the code
@@ -53323,7 +55139,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if (const_store
&& REGNO (operands[0]) == REGNO (operands[1])
&& INTVAL (operands[4]) != INTVAL (operands[5]))
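
[Annotation, not part of the patch] The free-register search the comment above refers to relies on peep2_reg_dead_p to find a register that may be clobbered between the two constant stores; only then can the pair be rewritten so an strd applies. The selection step, reduced to a toy with liveness as a bitmask and the candidate set narrowed to r0-r7 (both simplifying assumptions):

int find_dead_low_reg (unsigned live_mask)
{
  for (int r = 0; r < 8; r++)          /* candidate registers r0-r7 */
    if (!(live_mask & (1u << r)))
      return r;                        /* dead here: safe rename target */
  return -1;                           /* none free: keep the two strs */
}
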
-@@ -16002,7 +16319,6 @@ gen_operands_ldrd_strd (rtx *operands, bool load,
+@@ -16014,7 +15132,6 @@ gen_operands_ldrd_strd (rtx *operands, bool load,
}
else if (TARGET_ARM)
{
@@ -53331,7 +55147,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
int regno = REGNO (operands[0]);
if (!peep2_reg_dead_p (4, operands[0]))
{
-@@ -16356,7 +16672,7 @@ get_jump_table_size (rtx_jump_table_data *insn)
+@@ -16368,7 +15485,7 @@ get_jump_table_size (rtx_jump_table_data *insn)
{
case 1:
/* Round up size of TBB table to a halfword boundary. */
@@ -53340,12 +55156,53 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
break;
case 2:
/* No padding necessary for TBH. */
-@@ -17257,97 +17573,561 @@ note_invalid_constants (rtx_insn *insn, HOST_WIDE_INT address, int do_pushes)
+@@ -16837,35 +15954,37 @@ dump_minipool (rtx_insn *scan)
+ fputc ('\n', dump_file);
+ }
+
++ rtx val = copy_rtx (mp->value);
++
+ switch (GET_MODE_SIZE (mp->mode))
+ {
+ #ifdef HAVE_consttable_1
+ case 1:
+- scan = emit_insn_after (gen_consttable_1 (mp->value), scan);
++ scan = emit_insn_after (gen_consttable_1 (val), scan);
+ break;
+
+ #endif
+ #ifdef HAVE_consttable_2
+ case 2:
+- scan = emit_insn_after (gen_consttable_2 (mp->value), scan);
++ scan = emit_insn_after (gen_consttable_2 (val), scan);
+ break;
+
+ #endif
+ #ifdef HAVE_consttable_4
+ case 4:
+- scan = emit_insn_after (gen_consttable_4 (mp->value), scan);
++ scan = emit_insn_after (gen_consttable_4 (val), scan);
+ break;
+
+ #endif
+ #ifdef HAVE_consttable_8
+ case 8:
+- scan = emit_insn_after (gen_consttable_8 (mp->value), scan);
++ scan = emit_insn_after (gen_consttable_8 (val), scan);
+ break;
+
+ #endif
+ #ifdef HAVE_consttable_16
+ case 16:
+- scan = emit_insn_after (gen_consttable_16 (mp->value), scan);
++ scan = emit_insn_after (gen_consttable_16 (val), scan);
+ break;
+
+ #endif
+@@ -17269,6 +16388,470 @@ note_invalid_constants (rtx_insn *insn, HOST_WIDE_INT address, int do_pushes)
return;
}
--/* Rewrite move insn into subtract of 0 if the condition codes will
-- be useful in next conditional jump insn. */
+/* This function computes the clear mask and PADDING_BITS_TO_CLEAR for structs
+ and unions in the context of ARMv8-M Security Extensions. It is used as a
+ helper function for both 'cmse_nonsecure_call' and 'cmse_nonsecure_entry'
@@ -53362,31 +55219,17 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+comp_not_to_clear_mask_str_un (tree arg_type, int * regno,
+ uint32_t * padding_bits_to_clear,
+ unsigned starting_bit, int * last_used_bit)
-
--static void
--thumb1_reorg (void)
- {
-- basic_block bb;
++
++{
+ unsigned HOST_WIDE_INT not_to_clear_reg_mask = 0;
-
-- FOR_EACH_BB_FN (bb, cfun)
++
+ if (TREE_CODE (arg_type) == RECORD_TYPE)
- {
-- rtx dest, src;
-- rtx cmp, op0, op1, set = NULL;
-- rtx_insn *prev, *insn = BB_END (bb);
-- bool insn_clobbered = false;
++ {
+ unsigned current_bit = starting_bit;
+ tree field;
+ long int offset, size;
-
-- while (insn != BB_HEAD (bb) && !NONDEBUG_INSN_P (insn))
-- insn = PREV_INSN (insn);
-
-- /* Find the last cbranchsi4_insn in basic block BB. */
-- if (insn == BB_HEAD (bb)
-- || INSN_CODE (insn) != CODE_FOR_cbranchsi4_insn)
-- continue;
++
++
+ field = TYPE_FIELDS (arg_type);
+ while (field)
+ {
@@ -53396,18 +55239,11 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ offset = starting_bit;
+ offset += TREE_INT_CST_ELT (DECL_FIELD_BIT_OFFSET (field), 0);
+ offset %= 32;
-
-- /* Get the register with which we are comparing. */
-- cmp = XEXP (SET_SRC (PATTERN (insn)), 0);
-- op0 = XEXP (cmp, 0);
-- op1 = XEXP (cmp, 1);
++
+ /* This is the actual size of the field, for bitfields this is the
+ bitfield width and not the container size. */
+ size = TREE_INT_CST_ELT (DECL_SIZE (field), 0);
-
-- /* Check that comparison is against ZERO. */
-- if (!CONST_INT_P (op1) || INTVAL (op1) != 0)
-- continue;
++
+ if (*last_used_bit != offset)
+ {
+ if (offset < *last_used_bit)
@@ -53429,31 +55265,14 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ /* Otherwise we pad the bits between the last field's end and
+ the start of the new field. */
+ uint32_t mask;
-
-- /* Find the first flag setting insn before INSN in basic block BB. */
-- gcc_assert (insn != BB_HEAD (bb));
-- for (prev = PREV_INSN (insn);
-- (!insn_clobbered
-- && prev != BB_HEAD (bb)
-- && (NOTE_P (prev)
-- || DEBUG_INSN_P (prev)
-- || ((set = single_set (prev)) != NULL
-- && get_attr_conds (prev) == CONDS_NOCOND)));
-- prev = PREV_INSN (prev))
-- {
-- if (reg_set_p (op0, prev))
-- insn_clobbered = true;
-- }
++
+ mask = ((uint32_t)-1) >> (32 - offset);
+ mask -= ((uint32_t) 1 << *last_used_bit) - 1;
+ padding_bits_to_clear[*regno] |= mask;
+ }
+ current_bit = offset;
+ }
-
-- /* Skip if op0 is clobbered by insn other than prev. */
-- if (insn_clobbered)
-- continue;
++
+ /* Calculate further padding bits for inner structs/unions too. */
+ if (RECORD_OR_UNION_TYPE_P (TREE_TYPE (field)))
+ {
@@ -53478,9 +55297,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ }
+ *last_used_bit = current_bit;
+ }
-
-- if (!set)
-- continue;
++
+ field = TREE_CHAIN (field);
+ }
+ not_to_clear_reg_mask |= HOST_WIDE_INT_1U << *regno;
@@ -53520,31 +55337,14 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ regno_t = (field_size / 32) + *regno;
+ last_used_bit_t = (starting_bit + field_size) % 32;
+ }
-
-- dest = SET_DEST (set);
-- src = SET_SRC (set);
-- if (!low_register_operand (dest, SImode)
-- || !low_register_operand (src, SImode))
-- continue;
++
+ for (i = *regno; i < regno_t; i++)
+ {
+ /* For all but the last register used by this field only keep the
+ padding bits that were padding bits in this field. */
+ padding_bits_to_clear_res[i] &= padding_bits_to_clear_t[i];
+ }
-
-- /* Rewrite move into subtract of 0 if its operand is compared with ZERO
-- in INSN. Both src and dest of the move insn are checked. */
-- if (REGNO (op0) == REGNO (src) || REGNO (op0) == REGNO (dest))
-- {
-- dest = copy_rtx (dest);
-- src = copy_rtx (src);
-- src = gen_rtx_MINUS (SImode, src, const0_rtx);
-- PATTERN (prev) = gen_rtx_SET (dest, src);
-- INSN_CODE (prev) = -1;
-- /* Set test register in INSN to dest. */
-- XEXP (cmp, 0) = copy_rtx (dest);
-- INSN_CODE (insn) = -1;
++
+ /* For the last register, keep all padding bits that were padding
+ bits in this field and any padding bits that are still valid
+ as padding bits but fall outside of this field's size. */
@@ -53563,7 +55363,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ max_bit = last_used_bit_t;
+
+ field = TREE_CHAIN (field);
- }
++ }
+
+ /* Update the current padding_bits_to_clear using the intersection of the
+ padding bits of all the fields. */
@@ -53578,16 +55378,14 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+
+ *regno = max_reg;
+ *last_used_bit = max_bit;
- }
++ }
+ else
+ /* This function should only be used for structs and unions. */
+ gcc_unreachable ();
+
+ return not_to_clear_reg_mask;
- }
-
--/* Convert instructions to their cc-clobbering variant if possible, since
-- that allows us to use smaller encodings. */
++}
++
+/* In the context of ARMv8-M Security Extensions, this function is used for both
+ 'cmse_nonsecure_call' and 'cmse_nonsecure_entry' functions to compute what
+ registers are used when returning or passing arguments, which is then
@@ -53602,27 +55400,18 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+static unsigned HOST_WIDE_INT
+compute_not_to_clear_mask (tree arg_type, rtx arg_rtx, int regno,
+ uint32_t * padding_bits_to_clear)
-
--static void
--thumb2_reorg (void)
- {
-- basic_block bb;
-- regset_head live;
++
++{
+ int last_used_bit = 0;
+ unsigned HOST_WIDE_INT not_to_clear_mask;
-
-- INIT_REG_SET (&live);
++
+ if (RECORD_OR_UNION_TYPE_P (arg_type))
+ {
+ not_to_clear_mask
+ = comp_not_to_clear_mask_str_un (arg_type, &regno,
+ padding_bits_to_clear, 0,
+ &last_used_bit);
-
-- /* We are freeing block_for_insn in the toplev to keep compatibility
-- with old MDEP_REORGS that are not CFG based. Recompute it now. */
-- compute_bb_for_insn ();
-- df_analyze ();
++
+
+ /* If the 'last_used_bit' is not zero, that means we are still using a
+ part of the last 'regno'. In such cases we must clear the trailing
@@ -53878,101 +55667,10 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ }
+}
+
-+/* Rewrite move insn into subtract of 0 if the condition codes will
-+ be useful in next conditional jump insn. */
-+
-+static void
-+thumb1_reorg (void)
-+{
-+ basic_block bb;
-+
-+ FOR_EACH_BB_FN (bb, cfun)
-+ {
-+ rtx dest, src;
-+ rtx cmp, op0, op1, set = NULL;
-+ rtx_insn *prev, *insn = BB_END (bb);
-+ bool insn_clobbered = false;
-+
-+ while (insn != BB_HEAD (bb) && !NONDEBUG_INSN_P (insn))
-+ insn = PREV_INSN (insn);
-+
-+ /* Find the last cbranchsi4_insn in basic block BB. */
-+ if (insn == BB_HEAD (bb)
-+ || INSN_CODE (insn) != CODE_FOR_cbranchsi4_insn)
-+ continue;
-+
-+ /* Get the register with which we are comparing. */
-+ cmp = XEXP (SET_SRC (PATTERN (insn)), 0);
-+ op0 = XEXP (cmp, 0);
-+ op1 = XEXP (cmp, 1);
-+
-+ /* Check that comparison is against ZERO. */
-+ if (!CONST_INT_P (op1) || INTVAL (op1) != 0)
-+ continue;
-+
-+ /* Find the first flag setting insn before INSN in basic block BB. */
-+ gcc_assert (insn != BB_HEAD (bb));
-+ for (prev = PREV_INSN (insn);
-+ (!insn_clobbered
-+ && prev != BB_HEAD (bb)
-+ && (NOTE_P (prev)
-+ || DEBUG_INSN_P (prev)
-+ || ((set = single_set (prev)) != NULL
-+ && get_attr_conds (prev) == CONDS_NOCOND)));
-+ prev = PREV_INSN (prev))
-+ {
-+ if (reg_set_p (op0, prev))
-+ insn_clobbered = true;
-+ }
-+
-+ /* Skip if op0 is clobbered by insn other than prev. */
-+ if (insn_clobbered)
-+ continue;
-+
-+ if (!set)
-+ continue;
-+
-+ dest = SET_DEST (set);
-+ src = SET_SRC (set);
-+ if (!low_register_operand (dest, SImode)
-+ || !low_register_operand (src, SImode))
-+ continue;
-+
-+ /* Rewrite move into subtract of 0 if its operand is compared with ZERO
-+ in INSN. Both src and dest of the move insn are checked. */
-+ if (REGNO (op0) == REGNO (src) || REGNO (op0) == REGNO (dest))
-+ {
-+ dest = copy_rtx (dest);
-+ src = copy_rtx (src);
-+ src = gen_rtx_MINUS (SImode, src, const0_rtx);
-+ PATTERN (prev) = gen_rtx_SET (dest, src);
-+ INSN_CODE (prev) = -1;
-+ /* Set test register in INSN to dest. */
-+ XEXP (cmp, 0) = copy_rtx (dest);
-+ INSN_CODE (insn) = -1;
-+ }
-+ }
-+}
-+
-+/* Convert instructions to their cc-clobbering variant if possible, since
-+ that allows us to use smaller encodings. */
-+
-+static void
-+thumb2_reorg (void)
-+{
-+ basic_block bb;
-+ regset_head live;
-+
-+ INIT_REG_SET (&live);
-+
-+ /* We are freeing block_for_insn in the toplev to keep compatibility
-+ with old MDEP_REORGS that are not CFG based. Recompute it now. */
-+ compute_bb_for_insn ();
-+ df_analyze ();
-
- enum Convert_Action {SKIP, CONV, SWAP_CONV};
+ /* Rewrite move insn into subtract of 0 if the condition codes will
+ be useful in next conditional jump insn. */
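
[Annotation, not part of the patch] The two mask computations at the heart of the cmse_nonsecure_call padding logic above can be restated stand-alone. A sketch for a single 32-bit argument register, assuming 0 < offset < 32 with last_used_bit < offset for the inter-field case, and 0 < last_used_bit < 32 for the trailing case:

#include <stdint.h>

/* Padding between two fields: bits [last_used_bit, offset) must be
   cleared (same arithmetic as in comp_not_to_clear_mask_str_un). */
uint32_t interfield_padding (unsigned last_used_bit, unsigned offset)
{
  uint32_t mask = ((uint32_t) -1) >> (32 - offset);
  mask -= ((uint32_t) 1 << last_used_bit) - 1;
  return mask;
}

/* Trailing padding of the last, partially used register: everything
   from last_used_bit upwards must be cleared before the call. */
uint32_t trailing_padding (unsigned last_used_bit)
{
  return ~(((uint32_t) 1 << last_used_bit) - 1);
}

Registers that carry no argument bits at all are simply absent from not_to_clear_reg_mask and are cleared wholesale; these masks only refine the partially used ones.
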
-@@ -17557,6 +18337,8 @@ arm_reorg (void)
+@@ -17569,6 +17152,8 @@ arm_reorg (void)
HOST_WIDE_INT address = 0;
Mfix * fix;
@@ -53981,7 +55679,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if (TARGET_THUMB1)
thumb1_reorg ();
else if (TARGET_THUMB2)
-@@ -17929,6 +18711,23 @@ vfp_emit_fstmd (int base_reg, int count)
+@@ -17941,6 +17526,23 @@ vfp_emit_fstmd (int base_reg, int count)
return count * 8;
}
@@ -54005,7 +55703,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Emit a call instruction with pattern PAT. ADDR is the address of
the call target. */
-@@ -18588,6 +19387,8 @@ output_move_vfp (rtx *operands)
+@@ -18600,6 +18202,8 @@ output_move_vfp (rtx *operands)
rtx reg, mem, addr, ops[2];
int load = REG_P (operands[0]);
int dp = GET_MODE_SIZE (GET_MODE (operands[0])) == 8;
@@ -54014,7 +55712,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
int integer_p = GET_MODE_CLASS (GET_MODE (operands[0])) == MODE_INT;
const char *templ;
char buff[50];
-@@ -18600,8 +19401,10 @@ output_move_vfp (rtx *operands)
+@@ -18612,8 +18216,10 @@ output_move_vfp (rtx *operands)
gcc_assert (REG_P (reg));
gcc_assert (IS_VFP_REGNUM (REGNO (reg)));
@@ -54026,7 +55724,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
|| mode == SImode
|| mode == DImode
|| (TARGET_NEON && VALID_NEON_DREG_MODE (mode)));
-@@ -18632,7 +19435,7 @@ output_move_vfp (rtx *operands)
+@@ -18644,7 +18250,7 @@ output_move_vfp (rtx *operands)
sprintf (buff, templ,
load ? "ld" : "st",
@@ -54035,7 +55733,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
dp ? "P" : "",
integer_p ? "\t%@ int" : "");
output_asm_insn (buff, ops);
-@@ -19058,7 +19861,8 @@ shift_op (rtx op, HOST_WIDE_INT *amountp)
+@@ -19070,7 +18676,8 @@ shift_op (rtx op, HOST_WIDE_INT *amountp)
return NULL;
}
@@ -54045,7 +55743,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
return ARM_LSL_NAME;
default:
-@@ -19090,22 +19894,6 @@ shift_op (rtx op, HOST_WIDE_INT *amountp)
+@@ -19102,22 +18709,6 @@ shift_op (rtx op, HOST_WIDE_INT *amountp)
return mnem;
}
@@ -54068,7 +55766,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Output a .ascii pseudo-op, keeping track of lengths. This is
because /bin/as is horribly restrictive. The judgement about
whether or not each character is 'printable' (and can be output as
-@@ -19462,7 +20250,7 @@ arm_get_vfp_saved_size (void)
+@@ -19474,7 +19065,7 @@ arm_get_vfp_saved_size (void)
saved = 0;
/* Space for saved VFP registers. */
@@ -54077,7 +55775,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
{
count = 0;
for (regno = FIRST_VFP_REGNUM;
-@@ -19551,6 +20339,7 @@ output_return_instruction (rtx operand, bool really_return, bool reverse,
+@@ -19563,6 +19154,7 @@ output_return_instruction (rtx operand, bool really_return, bool reverse,
(e.g. interworking) then we can load the return address
directly into the PC. Otherwise we must load it into LR. */
if (really_return
@@ -54085,7 +55783,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
&& (IS_INTERRUPT (func_type) || !TARGET_INTERWORK))
return_reg = reg_names[PC_REGNUM];
else
-@@ -19691,8 +20480,45 @@ output_return_instruction (rtx operand, bool really_return, bool reverse,
+@@ -19703,18 +19295,93 @@ output_return_instruction (rtx operand, bool really_return, bool reverse,
break;
default:
@@ -54132,10 +55830,16 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
sprintf (instr, "bx%s\t%%|lr", conditional);
else
sprintf (instr, "mov%s\t%%|pc, %%|lr", conditional);
-@@ -19705,6 +20531,44 @@ output_return_instruction (rtx operand, bool really_return, bool reverse,
- return "";
- }
+ break;
+ }
+- output_asm_insn (instr, & operand);
++ output_asm_insn (instr, & operand);
++ }
++
++ return "";
++}
++
+/* Output in FILE asm statements needed to declare the NAME of the function
+ defined by its DECL node. */
+
@@ -54161,8 +55865,9 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+
+ ARM_DECLARE_FUNCTION_NAME (file, cmse_name, decl);
+ ASM_OUTPUT_TYPE_DIRECTIVE (file, cmse_name, "function");
-+ }
-+
+ }
+
+- return "";
+ ARM_DECLARE_FUNCTION_NAME (file, name, decl);
+ ASM_OUTPUT_TYPE_DIRECTIVE (file, name, "function");
+ ASM_DECLARE_RESULT (file, DECL_RESULT (decl));
@@ -54172,12 +55877,10 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ ASM_OUTPUT_LABEL (file, cmse_name);
+
+ ARM_OUTPUT_FN_UNWIND (file, TRUE);
-+}
-+
- /* Write the function name into the code section, directly preceding
- the function prologue.
+ }
-@@ -19754,10 +20618,6 @@ arm_output_function_prologue (FILE *f, HOST_WIDE_INT frame_size)
+ /* Write the function name into the code section, directly preceding
+@@ -19766,10 +19433,6 @@ arm_output_function_prologue (FILE *f, HOST_WIDE_INT frame_size)
{
unsigned long func_type;
@@ -54188,7 +55891,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Sanity check. */
gcc_assert (!arm_ccfsm_state && !arm_target_insn);
-@@ -19792,6 +20652,8 @@ arm_output_function_prologue (FILE *f, HOST_WIDE_INT frame_size)
+@@ -19804,6 +19467,8 @@ arm_output_function_prologue (FILE *f, HOST_WIDE_INT frame_size)
asm_fprintf (f, "\t%@ Nested: function declared inside another function.\n");
if (IS_STACKALIGN (func_type))
asm_fprintf (f, "\t%@ Stack Align: May be called with mis-aligned SP.\n");
@@ -54197,7 +55900,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
asm_fprintf (f, "\t%@ args = %d, pretend = %d, frame = %wd\n",
crtl->args.size,
-@@ -20461,7 +21323,7 @@ arm_emit_vfp_multi_reg_pop (int first_reg, int num_regs, rtx base_reg)
+@@ -20473,7 +20138,7 @@ arm_emit_vfp_multi_reg_pop (int first_reg, int num_regs, rtx base_reg)
REG_NOTES (par) = dwarf;
/* Make sure cfa doesn't leave with IP_REGNUM to allow unwinding from FP. */
@@ -54206,7 +55909,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
{
RTX_FRAME_RELATED_P (par) = 1;
add_reg_note (par, REG_CFA_DEF_CFA, hard_frame_pointer_rtx);
-@@ -20922,7 +21784,7 @@ arm_get_frame_offsets (void)
+@@ -20934,7 +20599,7 @@ arm_get_frame_offsets (void)
func_type = arm_current_func_type ();
/* Space for saved VFP registers. */
if (! IS_VOLATILE (func_type)
@@ -54215,7 +55918,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
saved += arm_get_vfp_saved_size ();
}
else /* TARGET_THUMB1 */
-@@ -21143,7 +22005,7 @@ arm_save_coproc_regs(void)
+@@ -21155,7 +20820,7 @@ arm_save_coproc_regs(void)
saved_size += 8;
}
@@ -54224,7 +55927,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
{
start_reg = FIRST_VFP_REGNUM;
-@@ -22923,6 +23785,8 @@ maybe_get_arm_condition_code (rtx comparison)
+@@ -22941,6 +22606,8 @@ maybe_get_arm_condition_code (rtx comparison)
{
case LTU: return ARM_CS;
case GEU: return ARM_CC;
@@ -54233,7 +55936,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
default: return ARM_NV;
}
-@@ -22948,6 +23812,14 @@ maybe_get_arm_condition_code (rtx comparison)
+@@ -22966,6 +22633,14 @@ maybe_get_arm_condition_code (rtx comparison)
default: return ARM_NV;
}
@@ -54248,7 +55951,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
case CCmode:
switch (comp_code)
{
-@@ -23378,7 +24250,7 @@ arm_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
+@@ -23396,7 +23071,7 @@ arm_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
{
if (GET_MODE_CLASS (mode) == MODE_CC)
return (regno == CC_REGNUM
@@ -54257,7 +55960,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
&& regno == VFPCC_REGNUM));
if (regno == CC_REGNUM && GET_MODE_CLASS (mode) != MODE_CC)
-@@ -23392,8 +24264,7 @@ arm_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
+@@ -23410,8 +23085,7 @@ arm_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
start of an even numbered register pair. */
return (ARM_NUM_REGS (mode) < 2) || (regno < LAST_LO_REGNUM);
@@ -54267,7 +55970,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
{
if (mode == SFmode || mode == SImode)
return VFP_REGNO_OK_FOR_SINGLE (regno);
-@@ -23401,10 +24272,12 @@ arm_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
+@@ -23419,10 +23093,12 @@ arm_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
if (mode == DFmode)
return VFP_REGNO_OK_FOR_DOUBLE (regno);
@@ -54283,7 +55986,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if (TARGET_NEON)
return (VALID_NEON_DREG_MODE (mode) && VFP_REGNO_OK_FOR_DOUBLE (regno))
-@@ -23608,26 +24481,6 @@ arm_debugger_arg_offset (int value, rtx addr)
+@@ -23626,26 +23302,6 @@ arm_debugger_arg_offset (int value, rtx addr)
return value;
}
@@ -54310,7 +56013,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Implement TARGET_PROMOTED_TYPE. */
static tree
-@@ -23867,8 +24720,8 @@ thumb_pop (FILE *f, unsigned long mask)
+@@ -23885,8 +23541,8 @@ thumb_pop (FILE *f, unsigned long mask)
if (mask & (1 << PC_REGNUM))
{
/* Catch popping the PC. */
@@ -54321,7 +56024,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
{
/* The PC is never popped directly; instead
it is popped into r3 and then BX is used. */
-@@ -23929,7 +24782,14 @@ thumb_exit (FILE *f, int reg_containing_return_addr)
+@@ -23947,7 +23603,14 @@ thumb_exit (FILE *f, int reg_containing_return_addr)
if (crtl->calls_eh_return)
asm_fprintf (f, "\tadd\t%r, %r\n", SP_REGNUM, ARM_EH_STACKADJ_REGNUM);
@@ -54337,7 +56040,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
return;
}
/* Otherwise if we are not supporting interworking and we have not created
-@@ -23938,7 +24798,8 @@ thumb_exit (FILE *f, int reg_containing_return_addr)
+@@ -23956,7 +23619,8 @@ thumb_exit (FILE *f, int reg_containing_return_addr)
else if (!TARGET_INTERWORK
&& !TARGET_BACKTRACE
&& !is_called_in_ARM_mode (current_function_decl)
@@ -54347,7 +56050,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
{
asm_fprintf (f, "\tpop\t{%r}\n", PC_REGNUM);
return;
-@@ -24161,7 +25022,21 @@ thumb_exit (FILE *f, int reg_containing_return_addr)
+@@ -24179,7 +23843,21 @@ thumb_exit (FILE *f, int reg_containing_return_addr)
asm_fprintf (f, "\tadd\t%r, %r\n", SP_REGNUM, ARM_EH_STACKADJ_REGNUM);
/* Return to caller. */
@@ -54370,7 +56073,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
}
/* Scan INSN just before assembler is output for it.
-@@ -25026,6 +25901,149 @@ thumb1_expand_prologue (void)
+@@ -25044,6 +24722,149 @@ thumb1_expand_prologue (void)
cfun->machine->lr_save_eliminated = 0;
}
@@ -54520,7 +56223,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Generate pattern *pop_multiple_with_stack_update_and_return if single
POP instruction can be generated. LR should be replaced by PC. All
the checks required are already done by USE_RETURN_INSN (). Hence,
-@@ -25047,6 +26065,12 @@ thumb2_expand_return (bool simple_return)
+@@ -25065,6 +24886,12 @@ thumb2_expand_return (bool simple_return)
if (!simple_return && saved_regs_mask)
{
@@ -54533,7 +56236,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if (num_regs == 1)
{
rtx par = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (2));
-@@ -25069,6 +26093,8 @@ thumb2_expand_return (bool simple_return)
+@@ -25087,6 +24914,8 @@ thumb2_expand_return (bool simple_return)
}
else
{
@@ -54542,7 +56245,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
emit_jump_insn (simple_return_rtx);
}
}
-@@ -25127,6 +26153,10 @@ thumb1_expand_epilogue (void)
+@@ -25145,6 +24974,10 @@ thumb1_expand_epilogue (void)
if (! df_regs_ever_live_p (LR_REGNUM))
emit_use (gen_rtx_REG (SImode, LR_REGNUM));
@@ -54553,7 +56256,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
}
/* Epilogue code for APCS frame. */
-@@ -25161,7 +26191,7 @@ arm_expand_epilogue_apcs_frame (bool really_return)
+@@ -25179,7 +25012,7 @@ arm_expand_epilogue_apcs_frame (bool really_return)
floats_from_frame += 4;
}
@@ -54562,7 +56265,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
{
int start_reg;
rtx ip_rtx = gen_rtx_REG (SImode, IP_REGNUM);
-@@ -25407,7 +26437,7 @@ arm_expand_epilogue (bool really_return)
+@@ -25425,7 +25258,7 @@ arm_expand_epilogue (bool really_return)
}
}
@@ -54571,7 +56274,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
{
/* Generate VFP register multi-pop. */
int end_reg = LAST_VFP_REGNUM + 1;
-@@ -25464,6 +26494,7 @@ arm_expand_epilogue (bool really_return)
+@@ -25482,6 +25315,7 @@ arm_expand_epilogue (bool really_return)
if (ARM_FUNC_TYPE (func_type) != ARM_FT_INTERWORKED
&& (TARGET_ARM || ARM_FUNC_TYPE (func_type) == ARM_FT_NORMAL)
@@ -54579,7 +56282,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
&& !IS_STACKALIGN (func_type)
&& really_return
&& crtl->args.pretend_args_size == 0
-@@ -25560,6 +26591,14 @@ arm_expand_epilogue (bool really_return)
+@@ -25578,6 +25412,14 @@ arm_expand_epilogue (bool really_return)
stack_pointer_rtx, stack_pointer_rtx);
}
@@ -54594,7 +56297,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
if (!really_return)
return;
-@@ -25856,13 +26895,6 @@ thumb_reload_out_hi (rtx *operands)
+@@ -25874,13 +25716,6 @@ thumb_reload_out_hi (rtx *operands)
emit_insn (gen_thumb_movhi_clobber (operands[0], operands[1], operands[2]));
}
@@ -54608,7 +56311,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Return the length of a function name prefix
that starts with the character 'c'. */
static int
-@@ -26000,7 +27032,7 @@ arm_file_start (void)
+@@ -26018,7 +25853,7 @@ arm_file_start (void)
const char* pos = strchr (arm_selected_arch->name, '+');
if (pos)
{
@@ -54617,7 +56320,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
gcc_assert (strlen (arm_selected_arch->name)
<= sizeof (buf) / sizeof (*pos));
strncpy (buf, arm_selected_arch->name,
-@@ -26025,7 +27057,7 @@ arm_file_start (void)
+@@ -26043,7 +25878,7 @@ arm_file_start (void)
if (print_tune_info)
arm_print_tune_info ();
@@ -54626,7 +56329,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
{
if (TARGET_HARD_FLOAT && TARGET_VFP_SINGLE)
arm_emit_eabi_attribute ("Tag_ABI_HardFP_use", 27, 1);
-@@ -26142,11 +27174,10 @@ arm_internal_label (FILE *stream, const char *prefix, unsigned long labelno)
+@@ -26160,11 +25995,10 @@ arm_internal_label (FILE *stream, const char *prefix, unsigned long labelno)
/* Output code to add DELTA to the first argument, and then jump
to FUNCTION. Used for C++ multiple inheritance. */
@@ -54641,7 +56344,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
{
static int thunk_label = 0;
char label[256];
-@@ -26287,6 +27318,76 @@ arm_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
+@@ -26305,6 +26139,76 @@ arm_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
final_end_function ();
}
@@ -54718,7 +56421,16 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
int
arm_emit_vector_const (FILE *file, rtx x)
{
-@@ -27671,7 +28772,7 @@ arm_conditional_register_usage (void)
+@@ -27543,7 +27447,7 @@ arm_mangle_type (const_tree type)
+ static const int thumb_core_reg_alloc_order[] =
+ {
+ 3, 2, 1, 0, 4, 5, 6, 7,
+- 14, 12, 8, 9, 10, 11
++ 12, 14, 8, 9, 10, 11
+ };
+
+ /* Adjust register allocation order when compiling for Thumb. */
+@@ -27689,7 +27593,7 @@ arm_conditional_register_usage (void)
if (TARGET_THUMB1)
fixed_regs[LR_REGNUM] = call_used_regs[LR_REGNUM] = 1;
@@ -54727,7 +56439,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
{
/* VFPv3 registers are disabled when earlier VFP
versions are selected due to the definition of
-@@ -27742,7 +28843,7 @@ arm_preferred_rename_class (reg_class_t rclass)
+@@ -27760,7 +27664,7 @@ arm_preferred_rename_class (reg_class_t rclass)
return NO_REGS;
}
@@ -54736,7 +56448,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
So this function MUST be kept in sync with that insn pattern. */
int
arm_attr_length_push_multi(rtx parallel_op, rtx first_op)
-@@ -27759,6 +28860,11 @@ arm_attr_length_push_multi(rtx parallel_op, rtx first_op)
+@@ -27777,6 +27681,11 @@ arm_attr_length_push_multi(rtx parallel_op, rtx first_op)
/* Thumb2 mode. */
regno = REGNO (first_op);
@@ -54748,7 +56460,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
hi_reg = (REGNO_REG_CLASS (regno) == HI_REGS) && (regno != LR_REGNUM);
for (i = 1; i < num_saves && !hi_reg; i++)
{
-@@ -27771,6 +28877,56 @@ arm_attr_length_push_multi(rtx parallel_op, rtx first_op)
+@@ -27789,6 +27698,56 @@ arm_attr_length_push_multi(rtx parallel_op, rtx first_op)
return 4;
}
@@ -54805,7 +56517,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Compute the number of instructions emitted by output_move_double. */
int
arm_count_output_move_double_insns (rtx *operands)
-@@ -27802,7 +28958,11 @@ vfp3_const_double_for_fract_bits (rtx operand)
+@@ -27820,7 +27779,11 @@ vfp3_const_double_for_fract_bits (rtx operand)
HOST_WIDE_INT value = real_to_integer (&r0);
value = value & 0xffffffff;
if ((value != 0) && ( (value & (value - 1)) == 0))
@@ -54818,7 +56530,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
}
}
return 0;
-@@ -27942,9 +29102,9 @@ emit_unlikely_jump (rtx insn)
+@@ -27960,9 +27923,9 @@ emit_unlikely_jump (rtx insn)
void
arm_expand_compare_and_swap (rtx operands[])
{
@@ -54830,7 +56542,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
bval = operands[0];
rval = operands[1];
-@@ -28001,43 +29161,54 @@ arm_expand_compare_and_swap (rtx operands[])
+@@ -28019,43 +27982,54 @@ arm_expand_compare_and_swap (rtx operands[])
gcc_unreachable ();
}
@@ -54900,7 +56612,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
mode = GET_MODE (mem);
bool is_armv8_sync = arm_arch8 && is_mm_sync (mod_s);
-@@ -28069,26 +29240,44 @@ arm_split_compare_and_swap (rtx operands[])
+@@ -28087,26 +28061,44 @@ arm_split_compare_and_swap (rtx operands[])
arm_emit_load_exclusive (mode, rval, mem, use_acquire);
@@ -54958,7 +56670,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
}
if (!is_mm_relaxed (mod_f))
-@@ -28103,6 +29292,15 @@ arm_split_compare_and_swap (rtx operands[])
+@@ -28121,6 +28113,15 @@ arm_split_compare_and_swap (rtx operands[])
emit_label (label2);
}
@@ -54974,7 +56686,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
void
arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
rtx value, rtx model_rtx, rtx cond)
-@@ -28111,6 +29309,7 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
+@@ -28129,6 +28130,7 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
machine_mode mode = GET_MODE (mem);
machine_mode wmode = (mode == DImode ? DImode : SImode);
rtx_code_label *label;
@@ -54982,7 +56694,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
rtx x;
bool is_armv8_sync = arm_arch8 && is_mm_sync (model);
-@@ -28145,6 +29344,28 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
+@@ -28163,6 +28165,28 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
arm_emit_load_exclusive (mode, old_out, mem, use_acquire);
@@ -55011,7 +56723,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
switch (code)
{
case SET:
-@@ -28359,6 +29580,8 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
+@@ -28377,6 +28401,8 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
case V8QImode: gen = gen_neon_vuzpv8qi_internal; break;
case V8HImode: gen = gen_neon_vuzpv8hi_internal; break;
case V4HImode: gen = gen_neon_vuzpv4hi_internal; break;
@@ -55020,7 +56732,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
case V4SImode: gen = gen_neon_vuzpv4si_internal; break;
case V2SImode: gen = gen_neon_vuzpv2si_internal; break;
case V2SFmode: gen = gen_neon_vuzpv2sf_internal; break;
-@@ -28432,6 +29655,8 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
+@@ -28450,6 +28476,8 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
case V8QImode: gen = gen_neon_vzipv8qi_internal; break;
case V8HImode: gen = gen_neon_vzipv8hi_internal; break;
case V4HImode: gen = gen_neon_vzipv4hi_internal; break;
@@ -55029,7 +56741,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
case V4SImode: gen = gen_neon_vzipv4si_internal; break;
case V2SImode: gen = gen_neon_vzipv2si_internal; break;
case V2SFmode: gen = gen_neon_vzipv2sf_internal; break;
-@@ -28484,6 +29709,8 @@ arm_evpc_neon_vrev (struct expand_vec_perm_d *d)
+@@ -28502,6 +28530,8 @@ arm_evpc_neon_vrev (struct expand_vec_perm_d *d)
case V8QImode: gen = gen_neon_vrev32v8qi; break;
case V8HImode: gen = gen_neon_vrev64v8hi; break;
case V4HImode: gen = gen_neon_vrev64v4hi; break;
@@ -55038,7 +56750,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
default:
return false;
}
-@@ -28567,6 +29794,8 @@ arm_evpc_neon_vtrn (struct expand_vec_perm_d *d)
+@@ -28585,6 +28615,8 @@ arm_evpc_neon_vtrn (struct expand_vec_perm_d *d)
case V8QImode: gen = gen_neon_vtrnv8qi_internal; break;
case V8HImode: gen = gen_neon_vtrnv8hi_internal; break;
case V4HImode: gen = gen_neon_vtrnv4hi_internal; break;
@@ -55047,7 +56759,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
case V4SImode: gen = gen_neon_vtrnv4si_internal; break;
case V2SImode: gen = gen_neon_vtrnv2si_internal; break;
case V2SFmode: gen = gen_neon_vtrnv2sf_internal; break;
-@@ -28642,6 +29871,8 @@ arm_evpc_neon_vext (struct expand_vec_perm_d *d)
+@@ -28660,6 +28692,8 @@ arm_evpc_neon_vext (struct expand_vec_perm_d *d)
case V8HImode: gen = gen_neon_vextv8hi; break;
case V2SImode: gen = gen_neon_vextv2si; break;
case V4SImode: gen = gen_neon_vextv4si; break;
@@ -55056,7 +56768,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
case V2SFmode: gen = gen_neon_vextv2sf; break;
case V4SFmode: gen = gen_neon_vextv4sf; break;
case V2DImode: gen = gen_neon_vextv2di; break;
-@@ -29167,7 +30398,7 @@ arm_validize_comparison (rtx *comparison, rtx * op1, rtx * op2)
+@@ -29185,7 +29219,7 @@ arm_validize_comparison (rtx *comparison, rtx * op1, rtx * op2)
{
enum rtx_code code = GET_CODE (*comparison);
int code_int;
@@ -55065,7 +56777,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
? GET_MODE (*op2) : GET_MODE (*op1);
gcc_assert (GET_MODE (*op1) != VOIDmode || GET_MODE (*op2) != VOIDmode);
-@@ -29195,11 +30426,19 @@ arm_validize_comparison (rtx *comparison, rtx * op1, rtx * op2)
+@@ -29213,11 +29247,19 @@ arm_validize_comparison (rtx *comparison, rtx * op1, rtx * op2)
*op2 = force_reg (mode, *op2);
return true;
@@ -55087,7 +56799,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
*op2 = force_reg (mode, *op2);
return true;
default:
-@@ -29741,11 +30980,57 @@ arm_macro_fusion_p (void)
+@@ -29759,11 +29801,57 @@ arm_macro_fusion_p (void)
return current_tune->fusible_ops != tune_params::FUSE_NOTHING;
}
@@ -55146,7 +56858,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
rtx prev_set = single_set (prev);
rtx curr_set = single_set (curr);
-@@ -29763,54 +31048,26 @@ aarch_macro_fusion_pair_p (rtx_insn* prev, rtx_insn* curr)
+@@ -29781,54 +29869,26 @@ aarch_macro_fusion_pair_p (rtx_insn* prev, rtx_insn* curr)
&& aarch_crypto_can_dual_issue (prev, curr))
return true;
@@ -55212,7 +56924,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
}
-@@ -29835,9 +31092,9 @@ arm_const_not_ok_for_debug_p (rtx p)
+@@ -29853,9 +29913,9 @@ arm_const_not_ok_for_debug_p (rtx p)
&& GET_CODE (XEXP (p, 0)) == SYMBOL_REF
&& (decl_op0 = SYMBOL_REF_DECL (XEXP (p, 0))))
{
@@ -55224,7 +56936,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
|| TREE_CODE (decl_op0) == CONST_DECL))
return (get_variable_section (decl_op1, false)
!= get_variable_section (decl_op0, false));
-@@ -29970,9 +31227,8 @@ arm_can_inline_p (tree caller, tree callee)
+@@ -29988,9 +30048,8 @@ arm_can_inline_p (tree caller, tree callee)
if ((caller_fpu->features & callee_fpu->features) != callee_fpu->features)
return false;
@@ -55236,7 +56948,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
return false;
/* OK to inline between different modes.
-@@ -30315,4 +31571,113 @@ arm_sched_fusion_priority (rtx_insn *insn, int max_pri,
+@@ -30333,4 +30392,113 @@ arm_sched_fusion_priority (rtx_insn *insn, int max_pri,
return;
}
@@ -55908,7 +57620,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(define_insn "addsi3_compare0"
[(set (reg:CC_NOOV CC_REGNUM)
(compare:CC_NOOV
-@@ -866,6 +1059,75 @@
+@@ -866,20 +1059,90 @@
(set_attr "type" "adcs_reg")]
)
@@ -55982,9 +57694,31 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+)
+
(define_insn "*subsi3_carryin"
- [(set (match_operand:SI 0 "s_register_operand" "=r,r")
- (minus:SI (minus:SI (match_operand:SI 1 "reg_or_int_operand" "r,I")
-@@ -1895,7 +2157,7 @@
+- [(set (match_operand:SI 0 "s_register_operand" "=r,r")
+- (minus:SI (minus:SI (match_operand:SI 1 "reg_or_int_operand" "r,I")
+- (match_operand:SI 2 "s_register_operand" "r,r"))
+- (ltu:SI (reg:CC_C CC_REGNUM) (const_int 0))))]
++ [(set (match_operand:SI 0 "s_register_operand" "=r,r,r")
++ (minus:SI (minus:SI (match_operand:SI 1 "reg_or_int_operand" "r,I,Pz")
++ (match_operand:SI 2 "s_register_operand" "r,r,r"))
++ (ltu:SI (reg:CC_C CC_REGNUM) (const_int 0))))]
+ "TARGET_32BIT"
+ "@
+ sbc%?\\t%0, %1, %2
+- rsc%?\\t%0, %2, %1"
++ rsc%?\\t%0, %2, %1
++ sbc%?\\t%0, %2, %2, lsl #1"
+ [(set_attr "conds" "use")
+- (set_attr "arch" "*,a")
++ (set_attr "arch" "*,a,t2")
+ (set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")
+- (set_attr "type" "adc_reg,adc_imm")]
++ (set_attr "type" "adc_reg,adc_imm,alu_shift_imm")]
+ )
+
+ (define_insn "*subsi3_carryin_const"
+@@ -1895,7 +2158,7 @@
[(set (match_operand:SF 0 "s_register_operand" "")
(div:SF (match_operand:SF 1 "s_register_operand" "")
(match_operand:SF 2 "s_register_operand" "")))]
@@ -55993,7 +57727,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
"")
(define_expand "divdf3"
-@@ -2137,13 +2399,13 @@
+@@ -2137,13 +2400,13 @@
for (i = 9; i <= 31; i++)
{
@@ -56009,7 +57743,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
== ~INTVAL (operands[2]))
{
rtx shift = GEN_INT (i);
-@@ -2442,7 +2704,7 @@
+@@ -2442,7 +2705,7 @@
{
int start_bit = INTVAL (operands[2]);
int width = INTVAL (operands[1]);
@@ -56018,7 +57752,23 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
rtx target, subtarget;
if (arm_arch_thumb2)
-@@ -3744,8 +4006,7 @@
+@@ -3050,7 +3313,14 @@
+ (xor:DI (match_operand:DI 1 "s_register_operand" "")
+ (match_operand:DI 2 "arm_xordi_operand" "")))]
+ "TARGET_32BIT"
+- ""
++ {
++ /* The iWMMXt pattern for xordi3 accepts only register operands but we want
++ to reuse this expander for all TARGET_32BIT targets so just force the
++ constants into a register. Unlike for the anddi3 and iordi3 there are
++ no NEON instructions that take an immediate. */
++ if (TARGET_IWMMXT && !REG_P (operands[2]))
++ operands[2] = force_reg (DImode, operands[2]);
++ }
+ )
+
+ (define_insn_and_split "*xordi3_insn"
+@@ -3744,8 +4014,7 @@
{
rtx scratch1, scratch2;
@@ -56028,7 +57778,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
{
emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1]));
DONE;
-@@ -3790,7 +4051,7 @@
+@@ -3790,7 +4059,7 @@
"TARGET_EITHER"
"
if (CONST_INT_P (operands[2])
@@ -56037,7 +57787,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
{
emit_insn (gen_movsi (operands[0], const0_rtx));
DONE;
-@@ -3818,8 +4079,7 @@
+@@ -3818,8 +4087,7 @@
{
rtx scratch1, scratch2;
@@ -56047,7 +57797,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
{
emit_insn (gen_arm_ashrdi3_1bit (operands[0], operands[1]));
DONE;
-@@ -3864,7 +4124,7 @@
+@@ -3864,7 +4132,7 @@
"TARGET_EITHER"
"
if (CONST_INT_P (operands[2])
@@ -56056,7 +57806,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
operands[2] = GEN_INT (31);
"
)
-@@ -3889,8 +4149,7 @@
+@@ -3889,8 +4157,7 @@
{
rtx scratch1, scratch2;
@@ -56066,7 +57816,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
{
emit_insn (gen_arm_lshrdi3_1bit (operands[0], operands[1]));
DONE;
-@@ -3935,7 +4194,7 @@
+@@ -3935,7 +4202,7 @@
"TARGET_EITHER"
"
if (CONST_INT_P (operands[2])
@@ -56075,7 +57825,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
{
emit_insn (gen_movsi (operands[0], const0_rtx));
DONE;
-@@ -3969,7 +4228,7 @@
+@@ -3969,7 +4236,7 @@
if (TARGET_32BIT)
{
if (CONST_INT_P (operands[2])
@@ -56084,7 +57834,37 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
operands[2] = GEN_INT (INTVAL (operands[2]) % 32);
}
else /* TARGET_THUMB1 */
-@@ -4326,23 +4585,29 @@
+@@ -4300,9 +4567,11 @@
+ (define_insn "*extv_reg"
+ [(set (match_operand:SI 0 "s_register_operand" "=r")
+ (sign_extract:SI (match_operand:SI 1 "s_register_operand" "r")
+- (match_operand:SI 2 "const_int_M_operand" "M")
+- (match_operand:SI 3 "const_int_M_operand" "M")))]
+- "arm_arch_thumb2"
++ (match_operand:SI 2 "const_int_operand" "n")
++ (match_operand:SI 3 "const_int_operand" "n")))]
++ "arm_arch_thumb2
++ && IN_RANGE (INTVAL (operands[3]), 0, 31)
++ && IN_RANGE (INTVAL (operands[2]), 1, 32 - INTVAL (operands[3]))"
+ "sbfx%?\t%0, %1, %3, %2"
+ [(set_attr "length" "4")
+ (set_attr "predicable" "yes")
+@@ -4313,9 +4582,11 @@
+ (define_insn "extzv_t2"
+ [(set (match_operand:SI 0 "s_register_operand" "=r")
+ (zero_extract:SI (match_operand:SI 1 "s_register_operand" "r")
+- (match_operand:SI 2 "const_int_M_operand" "M")
+- (match_operand:SI 3 "const_int_M_operand" "M")))]
+- "arm_arch_thumb2"
++ (match_operand:SI 2 "const_int_operand" "n")
++ (match_operand:SI 3 "const_int_operand" "n")))]
++ "arm_arch_thumb2
++ && IN_RANGE (INTVAL (operands[3]), 0, 31)
++ && IN_RANGE (INTVAL (operands[2]), 1, 32 - INTVAL (operands[3]))"
+ "ubfx%?\t%0, %1, %3, %2"
+ [(set_attr "length" "4")
+ (set_attr "predicable" "yes")
+@@ -4326,23 +4597,29 @@
;; Division instructions
(define_insn "divsi3"
@@ -56124,7 +57904,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(set_attr "predicable_short_it" "no")
(set_attr "type" "udiv")]
)
-@@ -4350,6 +4615,63 @@
+@@ -4350,6 +4627,63 @@
;; Unary arithmetic insns
@@ -56188,7 +57968,24 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(define_expand "negdi2"
[(parallel
[(set (match_operand:DI 0 "s_register_operand" "")
-@@ -4390,6 +4712,20 @@
+@@ -4367,12 +4701,13 @@
+
+ ;; The constraints here are to prevent a *partial* overlap (where %Q0 == %R1).
+ ;; The first alternative allows the common case of a *full* overlap.
+-(define_insn_and_split "*arm_negdi2"
++(define_insn_and_split "*negdi2_insn"
+ [(set (match_operand:DI 0 "s_register_operand" "=r,&r")
+ (neg:DI (match_operand:DI 1 "s_register_operand" "0,r")))
+ (clobber (reg:CC CC_REGNUM))]
+- "TARGET_ARM"
+- "#" ; "rsbs\\t%Q0, %Q1, #0\;rsc\\t%R0, %R1, #0"
++ "TARGET_32BIT"
++ "#" ; rsbs %Q0, %Q1, #0; rsc %R0, %R1, #0 (ARM)
++ ; negs %Q0, %Q1 ; sbc %R0, %R1, %R1, lsl #1 (Thumb-2)
+ "&& reload_completed"
+ [(parallel [(set (reg:CC CC_REGNUM)
+ (compare:CC (const_int 0) (match_dup 1)))
+@@ -4390,6 +4725,20 @@
(set_attr "type" "multiple")]
)
@@ -56209,7 +58006,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(define_expand "negsi2"
[(set (match_operand:SI 0 "s_register_operand" "")
(neg:SI (match_operand:SI 1 "s_register_operand" "")))]
-@@ -4412,7 +4748,7 @@
+@@ -4412,7 +4761,7 @@
(define_expand "negsf2"
[(set (match_operand:SF 0 "s_register_operand" "")
(neg:SF (match_operand:SF 1 "s_register_operand" "")))]
@@ -56218,7 +58015,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
""
)
-@@ -4685,7 +5021,7 @@
+@@ -4685,7 +5034,7 @@
(define_expand "sqrtsf2"
[(set (match_operand:SF 0 "s_register_operand" "")
(sqrt:SF (match_operand:SF 1 "s_register_operand" "")))]
@@ -56227,7 +58024,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
"")
(define_expand "sqrtdf2"
-@@ -4854,7 +5190,7 @@
+@@ -4854,7 +5203,7 @@
""
)
@@ -56236,7 +58033,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(define_expand "truncdfhf2"
[(set (match_operand:HF 0 "general_operand" "")
(float_truncate:HF
-@@ -5117,7 +5453,7 @@
+@@ -5117,7 +5466,7 @@
(match_operator 5 "subreg_lowpart_operator"
[(match_operand:SI 4 "s_register_operand" "")]))))]
"TARGET_32BIT
@@ -56245,7 +58042,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
== (GET_MODE_MASK (GET_MODE (operands[5]))
& (GET_MODE_MASK (GET_MODE (operands[5]))
<< (INTVAL (operands[2])))))"
-@@ -5361,7 +5697,7 @@
+@@ -5361,7 +5710,7 @@
""
)
@@ -56254,7 +58051,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(define_expand "extendhfdf2"
[(set (match_operand:DF 0 "general_operand" "")
(float_extend:DF (match_operand:HF 1 "general_operand" "")))]
-@@ -5490,7 +5826,7 @@
+@@ -5490,7 +5839,7 @@
[(set (match_operand:DI 0 "nonimmediate_di_operand" "=r, r, r, q, m")
(match_operand:DI 1 "di_operand" "rDa,Db,Dc,mi,q"))]
"TARGET_32BIT
@@ -56263,7 +58060,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
&& !TARGET_IWMMXT
&& ( register_operand (operands[0], DImode)
|| register_operand (operands[1], DImode))"
-@@ -5699,12 +6035,15 @@
+@@ -5699,12 +6048,15 @@
;; LO_SUM adds in the high bits. Fortunately these are opaque operations
;; so this does not matter.
(define_insn "*arm_movt"
@@ -56285,7 +58082,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(set_attr "predicable_short_it" "no")
(set_attr "length" "4")
(set_attr "type" "alu_sreg")]
-@@ -5713,8 +6052,7 @@
+@@ -5713,8 +6065,7 @@
(define_insn "*arm_movsi_insn"
[(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r,rk,m")
(match_operand:SI 1 "general_operand" "rk, I,K,j,mi,rk"))]
@@ -56295,7 +58092,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
&& ( register_operand (operands[0], SImode)
|| register_operand (operands[1], SImode))"
"@
-@@ -5726,6 +6064,7 @@
+@@ -5726,6 +6077,7 @@
str%?\\t%1, %0"
[(set_attr "type" "mov_reg,mov_imm,mvn_imm,mov_imm,load1,store1")
(set_attr "predicable" "yes")
@@ -56303,7 +58100,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(set_attr "pool_range" "*,*,*,*,4096,*")
(set_attr "neg_pool_range" "*,*,*,*,4084,*")]
)
-@@ -5762,7 +6101,8 @@
+@@ -5762,7 +6114,8 @@
[(set (match_operand:SI 0 "arm_general_register_operand" "")
(const:SI (plus:SI (match_operand:SI 1 "general_operand" "")
(match_operand:SI 2 "const_int_operand" ""))))]
@@ -56313,7 +58110,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
&& arm_disable_literal_pool
&& reload_completed
&& GET_CODE (operands[1]) == SYMBOL_REF"
-@@ -5793,8 +6133,7 @@
+@@ -5793,8 +6146,7 @@
(define_split
[(set (match_operand:SI 0 "arm_general_register_operand" "")
(match_operand:SI 1 "general_operand" ""))]
@@ -56323,7 +58120,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
&& !flag_pic && !target_word_relocations
&& !arm_tls_referenced_p (operands[1])"
[(clobber (const_int 0))]
-@@ -6362,7 +6701,7 @@
+@@ -6362,7 +6714,7 @@
[(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,m,r")
(match_operand:HI 1 "general_operand" "rIk,K,n,r,mi"))]
"TARGET_ARM
@@ -56332,7 +58129,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
&& (register_operand (operands[0], HImode)
|| register_operand (operands[1], HImode))"
"@
-@@ -6388,7 +6727,7 @@
+@@ -6388,7 +6740,7 @@
(define_insn "*movhi_bytes"
[(set (match_operand:HI 0 "s_register_operand" "=r,r,r")
(match_operand:HI 1 "arm_rhs_operand" "I,rk,K"))]
@@ -56341,7 +58138,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
"@
mov%?\\t%0, %1\\t%@ movhi
mov%?\\t%0, %1\\t%@ movhi
-@@ -6396,7 +6735,7 @@
+@@ -6396,7 +6748,7 @@
[(set_attr "predicable" "yes")
(set_attr "type" "mov_imm,mov_reg,mvn_imm")]
)
@@ -56350,7 +58147,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
;; We use a DImode scratch because we may occasionally need an additional
;; temporary if the address isn't offsettable -- push_reload doesn't seem
;; to take any notice of the "o" constraints on reload_memory_operand operand.
-@@ -6518,7 +6857,7 @@
+@@ -6518,7 +6870,7 @@
strb%?\\t%1, %0"
[(set_attr "type" "mov_reg,mov_reg,mov_imm,mov_imm,mvn_imm,load1,store1,load1,store1")
(set_attr "predicable" "yes")
@@ -56359,7 +58156,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(set_attr "arch" "t2,any,any,t2,any,t2,t2,any,any")
(set_attr "length" "2,4,4,2,4,2,2,4,4")]
)
-@@ -6548,7 +6887,7 @@
+@@ -6548,7 +6900,7 @@
(define_insn "*arm32_movhf"
[(set (match_operand:HF 0 "nonimmediate_operand" "=r,m,r,r")
(match_operand:HF 1 "general_operand" " m,r,r,F"))]
@@ -56368,7 +58165,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
&& ( s_register_operand (operands[0], HFmode)
|| s_register_operand (operands[1], HFmode))"
"*
-@@ -6892,7 +7231,7 @@
+@@ -6892,7 +7244,7 @@
[(set (pc) (if_then_else
(match_operator 0 "expandable_comparison_operator"
[(match_operand:SF 1 "s_register_operand" "")
@@ -56377,7 +58174,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(label_ref (match_operand 3 "" ""))
(pc)))]
"TARGET_32BIT && TARGET_HARD_FLOAT"
-@@ -6904,7 +7243,7 @@
+@@ -6904,7 +7256,7 @@
[(set (pc) (if_then_else
(match_operator 0 "expandable_comparison_operator"
[(match_operand:DF 1 "s_register_operand" "")
@@ -56386,7 +58183,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(label_ref (match_operand 3 "" ""))
(pc)))]
"TARGET_32BIT && TARGET_HARD_FLOAT && !TARGET_VFP_SINGLE"
-@@ -7366,11 +7705,29 @@
+@@ -7366,11 +7718,29 @@
DONE;
}")
@@ -56417,7 +58214,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
"TARGET_32BIT && TARGET_HARD_FLOAT"
"emit_insn (gen_cstore_cc (operands[0], operands[1],
operands[2], operands[3])); DONE;"
-@@ -7380,7 +7737,7 @@
+@@ -7380,7 +7750,7 @@
[(set (match_operand:SI 0 "s_register_operand" "")
(match_operator:SI 1 "expandable_comparison_operator"
[(match_operand:DF 2 "s_register_operand" "")
@@ -56426,7 +58223,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
"TARGET_32BIT && TARGET_HARD_FLOAT && !TARGET_VFP_SINGLE"
"emit_insn (gen_cstore_cc (operands[0], operands[1],
operands[2], operands[3])); DONE;"
-@@ -7418,9 +7775,31 @@
+@@ -7418,9 +7788,31 @@
rtx ccreg;
if (!arm_validize_comparison (&operands[1], &XEXP (operands[1], 0),
@@ -56460,7 +58257,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
code = GET_CODE (operands[1]);
ccreg = arm_gen_compare_reg (code, XEXP (operands[1], 0),
XEXP (operands[1], 1), NULL_RTX);
-@@ -7439,7 +7818,7 @@
+@@ -7439,7 +7831,7 @@
enum rtx_code code = GET_CODE (operands[1]);
rtx ccreg;
@@ -56469,7 +58266,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
&XEXP (operands[1], 1)))
FAIL;
-@@ -7504,6 +7883,37 @@
+@@ -7504,6 +7896,37 @@
(set_attr "type" "fcsel")]
)
@@ -56507,7 +58304,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(define_insn_and_split "*movsicc_insn"
[(set (match_operand:SI 0 "s_register_operand" "=r,r,r,r,r,r,r,r")
(if_then_else:SI
-@@ -7627,6 +8037,7 @@
+@@ -7627,6 +8050,7 @@
"
{
rtx callee, pat;
@@ -56515,7 +58312,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* In an untyped call, we can get NULL for operand 2. */
if (operands[2] == NULL_RTX)
-@@ -7641,8 +8052,17 @@
+@@ -7641,8 +8065,17 @@
: !REG_P (callee))
XEXP (operands[0], 0) = force_reg (Pmode, callee);
@@ -56535,7 +58332,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
DONE;
}"
)
-@@ -7653,6 +8073,24 @@
+@@ -7653,6 +8086,24 @@
(use (match_operand 2 "" ""))
(clobber (reg:SI LR_REGNUM))])])
@@ -56560,7 +58357,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(define_insn "*call_reg_armv5"
[(call (mem:SI (match_operand:SI 0 "s_register_operand" "r"))
(match_operand 1 "" ""))
-@@ -7688,6 +8126,7 @@
+@@ -7688,6 +8139,7 @@
"
{
rtx pat, callee;
@@ -56568,7 +58365,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* In an untyped call, we can get NULL for operand 2. */
if (operands[3] == 0)
-@@ -7702,9 +8141,18 @@
+@@ -7702,9 +8154,18 @@
: !REG_P (callee))
XEXP (operands[1], 0) = force_reg (Pmode, callee);
@@ -56590,7 +58387,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
DONE;
}"
)
-@@ -7716,6 +8164,25 @@
+@@ -7716,6 +8177,25 @@
(use (match_operand 3 "" ""))
(clobber (reg:SI LR_REGNUM))])])
@@ -56616,7 +58413,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(define_insn "*call_value_reg_armv5"
[(set (match_operand 0 "" "")
(call (mem:SI (match_operand:SI 1 "s_register_operand" "r"))
-@@ -8153,8 +8620,8 @@
+@@ -8153,8 +8633,8 @@
)
(define_insn "probe_stack"
@@ -56627,7 +58424,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
"TARGET_32BIT"
"str%?\\tr0, %0"
[(set_attr "type" "store1")
-@@ -10221,8 +10688,8 @@
+@@ -10221,8 +10701,8 @@
(match_operand 1 "const_int_operand" "")))
(clobber (match_scratch:SI 2 ""))]
"TARGET_ARM
@@ -56638,7 +58435,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
[(set (match_dup 2) (zero_extend:SI (match_dup 0)))
(set (reg:CC CC_REGNUM) (compare:CC (match_dup 2) (match_dup 1)))]
"
-@@ -10562,7 +11029,11 @@
+@@ -10562,7 +11042,11 @@
}
"
[(set_attr "type" "load4")
@@ -56651,7 +58448,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
)
;; Pop with return (as used in epilogue RTL)
-@@ -10591,7 +11062,10 @@
+@@ -10591,7 +11075,10 @@
}
"
[(set_attr "type" "load4")
@@ -56663,7 +58460,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
)
(define_insn "*pop_multiple_with_return"
-@@ -10611,7 +11085,10 @@
+@@ -10611,7 +11098,10 @@
}
"
[(set_attr "type" "load4")
@@ -56675,7 +58472,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
)
;; Load into PC and return
-@@ -10632,7 +11109,7 @@
+@@ -10632,7 +11122,7 @@
(match_operand:SI 2 "const_int_I_operand" "I")))
(set (match_operand:DF 3 "vfp_hard_register_operand" "")
(mem:DF (match_dup 1)))])]
@@ -56684,7 +58481,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
"*
{
int num_regs = XVECLEN (operands[0], 0);
-@@ -10822,19 +11299,22 @@
+@@ -10822,19 +11312,22 @@
(set_attr "predicable_short_it" "no")
(set_attr "type" "clz")])
@@ -56716,644 +58513,18498 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ DONE;
+")
- ;; V5E instructions.
+ ;; V5E instructions.
+
+@@ -10958,13 +11451,16 @@
+ ;; We only care about the lower 16 bits of the constant
+ ;; being inserted into the upper 16 bits of the register.
+ (define_insn "*arm_movtas_ze"
+- [(set (zero_extract:SI (match_operand:SI 0 "s_register_operand" "+r")
++ [(set (zero_extract:SI (match_operand:SI 0 "s_register_operand" "+r,r")
+ (const_int 16)
+ (const_int 16))
+ (match_operand:SI 1 "const_int_operand" ""))]
+- "arm_arch_thumb2"
+- "movt%?\t%0, %L1"
+- [(set_attr "predicable" "yes")
++ "TARGET_HAVE_MOVT"
++ "@
++ movt%?\t%0, %L1
++ movt\t%0, %L1"
++ [(set_attr "arch" "32,v8mb")
++ (set_attr "predicable" "yes")
+ (set_attr "predicable_short_it" "no")
+ (set_attr "length" "4")
+ (set_attr "type" "alu_sreg")]
+--- a/src/gcc/config/arm/arm.opt
++++ b/src/gcc/config/arm/arm.opt
+@@ -109,6 +109,10 @@ mfloat-abi=
+ Target RejectNegative Joined Enum(float_abi_type) Var(arm_float_abi) Init(TARGET_DEFAULT_FLOAT_ABI)
+ Specify if floating point hardware should be used.
+
++mcmse
++Target RejectNegative Var(use_cmse)
++Specify that the compiler should target secure code as per ARMv8-M Security Extensions.
++
+ Enum
+ Name(float_abi_type) Type(enum float_abi_type)
+ Known floating-point ABIs (for use with the -mfloat-abi= option):
+@@ -253,14 +257,6 @@ mrestrict-it
+ Target Report Var(arm_restrict_it) Init(2) Save
+ Generate IT blocks appropriate for ARMv8.
+
+-mold-rtx-costs
+-Target Report Mask(OLD_RTX_COSTS)
+-Use the old RTX costing tables (transitional).
+-
+-mnew-generic-costs
+-Target Report Mask(NEW_GENERIC_COSTS)
+-Use the new generic RTX cost tables if new core-specific cost table not available (transitional).
+-
+ mfix-cortex-m3-ldrd
+ Target Report Var(fix_cm3_ldrd) Init(2)
+ Avoid overlapping destination and address registers on LDRD instructions
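
[Aside: the new -mcmse flag is what gates the CMSE code generation touched throughout this patch (the cmse_name handling in the arm.c hunks above, the arm_cmse.h header added below). A minimal sketch of what it enables, assuming an ARMv8-M toolchain carrying this patch and the cmse_nonsecure_entry attribute spelling referenced in arm_cmse.h below; the function name and compile line are illustrative:

/* Sketch only: a secure-state entry point callable from non-secure code.
   Built with something like: arm-none-eabi-gcc -mcmse -c entry.c  */
int __attribute__ ((cmse_nonsecure_entry))
secure_increment (int x)
{
  /* -mcmse makes the compiler emit the secondary cmse_name label and the
     hardened return sequence handled in the arm.c changes above.  */
  return x + 1;
}
]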
+--- /dev/null
++++ b/src/gcc/config/arm/arm_cmse.h
+@@ -0,0 +1,199 @@
++/* ARMv8-M Secure Extensions intrinsics include file.
++
++ Copyright (C) 2015-2016 Free Software Foundation, Inc.
++ Contributed by ARM Ltd.
++
++ This file is part of GCC.
++
++ GCC is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published
++ by the Free Software Foundation; either version 3, or (at your
++ option) any later version.
++
++ GCC is distributed in the hope that it will be useful, but WITHOUT
++ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
++ License for more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++
++#ifndef _GCC_ARM_CMSE_H
++#define _GCC_ARM_CMSE_H
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++#if __ARM_FEATURE_CMSE & 1
++
++#include <stddef.h>
++#include <stdint.h>
++
++#ifdef __ARM_BIG_ENDIAN
++
++typedef union {
++ struct cmse_address_info {
++#if __ARM_FEATURE_CMSE & 2
++ unsigned idau_region:8;
++ unsigned idau_region_valid:1;
++ unsigned secure:1;
++ unsigned nonsecure_readwrite_ok:1;
++ unsigned nonsecure_read_ok:1;
++#else
++ unsigned :12;
++#endif
++ unsigned readwrite_ok:1;
++ unsigned read_ok:1;
++#if __ARM_FEATURE_CMSE & 2
++ unsigned sau_region_valid:1;
++#else
++ unsigned :1;
++#endif
++ unsigned mpu_region_valid:1;
++#if __ARM_FEATURE_CMSE & 2
++ unsigned sau_region:8;
++#else
++ unsigned :8;
++#endif
++ unsigned mpu_region:8;
++ } flags;
++ unsigned value;
++} cmse_address_info_t;
++
++#else
++
++typedef union {
++ struct cmse_address_info {
++ unsigned mpu_region:8;
++#if __ARM_FEATURE_CMSE & 2
++ unsigned sau_region:8;
++#else
++ unsigned :8;
++#endif
++ unsigned mpu_region_valid:1;
++#if __ARM_FEATURE_CMSE & 2
++ unsigned sau_region_valid:1;
++#else
++ unsigned :1;
++#endif
++ unsigned read_ok:1;
++ unsigned readwrite_ok:1;
++#if __ARM_FEATURE_CMSE & 2
++ unsigned nonsecure_read_ok:1;
++ unsigned nonsecure_readwrite_ok:1;
++ unsigned secure:1;
++ unsigned idau_region_valid:1;
++ unsigned idau_region:8;
++#else
++ unsigned :12;
++#endif
++ } flags;
++ unsigned value;
++} cmse_address_info_t;
++
++#endif /* __ARM_BIG_ENDIAN */
++
++#define cmse_TT_fptr(p) (__cmse_TT_fptr ((__cmse_fptr)(p)))
++
++typedef void (*__cmse_fptr)(void);
++
++#define __CMSE_TT_ASM(flags) \
++{ \
++ cmse_address_info_t __result; \
++ __asm__ ("tt" # flags " %0,%1" \
++ : "=r"(__result) \
++ : "r"(__p) \
++ : "memory"); \
++ return __result; \
++}
++
++__extension__ static __inline __attribute__ ((__always_inline__))
++cmse_address_info_t
++__cmse_TT_fptr (__cmse_fptr __p)
++__CMSE_TT_ASM ()
++
++__extension__ static __inline __attribute__ ((__always_inline__))
++cmse_address_info_t
++cmse_TT (void *__p)
++__CMSE_TT_ASM ()
++
++#define cmse_TTT_fptr(p) (__cmse_TTT_fptr ((__cmse_fptr)(p)))
++
++__extension__ static __inline __attribute__ ((__always_inline__))
++cmse_address_info_t
++__cmse_TTT_fptr (__cmse_fptr __p)
++__CMSE_TT_ASM (t)
++
++__extension__ static __inline __attribute__ ((__always_inline__))
++cmse_address_info_t
++cmse_TTT (void *__p)
++__CMSE_TT_ASM (t)
++
++#if __ARM_FEATURE_CMSE & 2
++
++#define cmse_TTA_fptr(p) (__cmse_TTA_fptr ((__cmse_fptr)(p)))
++
++__extension__ static __inline __attribute__ ((__always_inline__))
++cmse_address_info_t
++__cmse_TTA_fptr (__cmse_fptr __p)
++__CMSE_TT_ASM (a)
++
++__extension__ static __inline __attribute__ ((__always_inline__))
++cmse_address_info_t
++cmse_TTA (void *__p)
++__CMSE_TT_ASM (a)
++
++#define cmse_TTAT_fptr(p) (__cmse_TTAT_fptr ((__cmse_fptr)(p)))
++
++__extension__ static __inline cmse_address_info_t
++__attribute__ ((__always_inline__))
++__cmse_TTAT_fptr (__cmse_fptr __p)
++__CMSE_TT_ASM (at)
++
++__extension__ static __inline cmse_address_info_t
++__attribute__ ((__always_inline__))
++cmse_TTAT (void *__p)
++__CMSE_TT_ASM (at)
++
++/* FIXME: diagnose use outside cmse_nonsecure_entry functions. */
++__extension__ static __inline int __attribute__ ((__always_inline__))
++cmse_nonsecure_caller (void)
++{
++ return __builtin_arm_cmse_nonsecure_caller ();
++}
++
++#define CMSE_AU_NONSECURE 2
++#define CMSE_MPU_NONSECURE 16
++#define CMSE_NONSECURE 18
++
++#define cmse_nsfptr_create(p) ((typeof ((p))) ((intptr_t) (p) & ~1))
++
++#define cmse_is_nsfptr(p) (!((intptr_t) (p) & 1))
++
++#endif /* __ARM_FEATURE_CMSE & 2 */
++
++#define CMSE_MPU_UNPRIV 4
++#define CMSE_MPU_READWRITE 1
++#define CMSE_MPU_READ 8
++
++__extension__ void *
++cmse_check_address_range (void *, size_t, int);
++
++#define cmse_check_pointed_object(p, f) \
++ ((typeof ((p))) cmse_check_address_range ((p), sizeof (*(p)), (f)))
++
++#endif /* __ARM_FEATURE_CMSE & 1 */
++
++#ifdef __cplusplus
++}
++#endif
++
++#endif /* _GCC_ARM_CMSE_H */
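
[Aside: a short usage sketch for the header above, not part of the patch: validating an address range handed over from the non-secure world before writing through it. Assumes secure-state compilation with -mcmse on an ARMv8-M Mainline target, so that __ARM_FEATURE_CMSE has bit 2 set and CMSE_NONSECURE is available:

#include <arm_cmse.h>
#include <stddef.h>
#include <string.h>

int
copy_to_nonsecure (void *ns_dst, const void *src, size_t n)
{
  /* cmse_check_address_range returns ns_dst only if the whole range is
     read/write-accessible from the non-secure side; NULL otherwise.  */
  void *ok = cmse_check_address_range (ns_dst, n,
                                       CMSE_NONSECURE | CMSE_MPU_READWRITE);
  if (ok == NULL)
    return -1;
  memcpy (ok, src, n);
  return 0;
}
]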
+--- /dev/null
++++ b/src/gcc/config/arm/arm_fp16.h
+@@ -0,0 +1,255 @@
++/* ARM FP16 intrinsics include file.
++
++ Copyright (C) 2016 Free Software Foundation, Inc.
++ Contributed by ARM Ltd.
++
++ This file is part of GCC.
++
++ GCC is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published
++ by the Free Software Foundation; either version 3, or (at your
++ option) any later version.
++
++ GCC is distributed in the hope that it will be useful, but WITHOUT
++ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
++ License for more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++#ifndef _GCC_ARM_FP16_H
++#define _GCC_ARM_FP16_H 1
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++#include <stdint.h>
++
++/* Intrinsics for FP16 instructions. */
++#pragma GCC push_options
++#pragma GCC target ("fpu=fp-armv8")
++
++#if defined (__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
++
++typedef __fp16 float16_t;
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vabsh_f16 (float16_t __a)
++{
++ return __builtin_neon_vabshf (__a);
++}
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vaddh_f16 (float16_t __a, float16_t __b)
++{
++ return __a + __b;
++}
++
++__extension__ static __inline int32_t __attribute__ ((__always_inline__))
++vcvtah_s32_f16 (float16_t __a)
++{
++ return __builtin_neon_vcvtahssi (__a);
++}
++
++__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
++vcvtah_u32_f16 (float16_t __a)
++{
++ return __builtin_neon_vcvtahusi (__a);
++}
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vcvth_f16_s32 (int32_t __a)
++{
++ return __builtin_neon_vcvthshf (__a);
++}
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vcvth_f16_u32 (uint32_t __a)
++{
++ return __builtin_neon_vcvthuhf (__a);
++}
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vcvth_n_f16_s32 (int32_t __a, const int __b)
++{
++ return __builtin_neon_vcvths_nhf (__a, __b);
++}
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vcvth_n_f16_u32 (uint32_t __a, const int __b)
++{
++ return __builtin_neon_vcvthu_nhf ((int32_t)__a, __b);
++}
++
++__extension__ static __inline int32_t __attribute__ ((__always_inline__))
++vcvth_n_s32_f16 (float16_t __a, const int __b)
++{
++ return __builtin_neon_vcvths_nsi (__a, __b);
++}
++
++__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
++vcvth_n_u32_f16 (float16_t __a, const int __b)
++{
++ return (uint32_t)__builtin_neon_vcvthu_nsi (__a, __b);
++}
++
++__extension__ static __inline int32_t __attribute__ ((__always_inline__))
++vcvth_s32_f16 (float16_t __a)
++{
++ return __builtin_neon_vcvthssi (__a);
++}
++
++__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
++vcvth_u32_f16 (float16_t __a)
++{
++ return __builtin_neon_vcvthusi (__a);
++}
++
++__extension__ static __inline int32_t __attribute__ ((__always_inline__))
++vcvtmh_s32_f16 (float16_t __a)
++{
++ return __builtin_neon_vcvtmhssi (__a);
++}
++
++__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
++vcvtmh_u32_f16 (float16_t __a)
++{
++ return __builtin_neon_vcvtmhusi (__a);
++}
++
++__extension__ static __inline int32_t __attribute__ ((__always_inline__))
++vcvtnh_s32_f16 (float16_t __a)
++{
++ return __builtin_neon_vcvtnhssi (__a);
++}
++
++__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
++vcvtnh_u32_f16 (float16_t __a)
++{
++ return __builtin_neon_vcvtnhusi (__a);
++}
++
++__extension__ static __inline int32_t __attribute__ ((__always_inline__))
++vcvtph_s32_f16 (float16_t __a)
++{
++ return __builtin_neon_vcvtphssi (__a);
++}
++
++__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
++vcvtph_u32_f16 (float16_t __a)
++{
++ return __builtin_neon_vcvtphusi (__a);
++}
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vdivh_f16 (float16_t __a, float16_t __b)
++{
++ return __a / __b;
++}
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vfmah_f16 (float16_t __a, float16_t __b, float16_t __c)
++{
++ return __builtin_neon_vfmahf (__a, __b, __c);
++}
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vfmsh_f16 (float16_t __a, float16_t __b, float16_t __c)
++{
++ return __builtin_neon_vfmshf (__a, __b, __c);
++}
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vmaxnmh_f16 (float16_t __a, float16_t __b)
++{
++ return __builtin_neon_vmaxnmhf (__a, __b);
++}
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vminnmh_f16 (float16_t __a, float16_t __b)
++{
++ return __builtin_neon_vminnmhf (__a, __b);
++}
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vmulh_f16 (float16_t __a, float16_t __b)
++{
++ return __a * __b;
++}
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vnegh_f16 (float16_t __a)
++{
++ return - __a;
++}
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vrndah_f16 (float16_t __a)
++{
++ return __builtin_neon_vrndahf (__a);
++}
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vrndh_f16 (float16_t __a)
++{
++ return __builtin_neon_vrndhf (__a);
++}
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vrndih_f16 (float16_t __a)
++{
++ return __builtin_neon_vrndihf (__a);
++}
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vrndmh_f16 (float16_t __a)
++{
++ return __builtin_neon_vrndmhf (__a);
++}
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vrndnh_f16 (float16_t __a)
++{
++ return __builtin_neon_vrndnhf (__a);
++}
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vrndph_f16 (float16_t __a)
++{
++ return __builtin_neon_vrndphf (__a);
++}
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vrndxh_f16 (float16_t __a)
++{
++ return __builtin_neon_vrndxhf (__a);
++}
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vsqrth_f16 (float16_t __a)
++{
++ return __builtin_neon_vsqrthf (__a);
++}
++
++__extension__ static __inline float16_t __attribute__ ((__always_inline__))
++vsubh_f16 (float16_t __a, float16_t __b)
++{
++ return __a - __b;
++}
++
++#endif /* __ARM_FEATURE_FP16_SCALAR_ARITHMETIC */
++#pragma GCC pop_options
++
++#ifdef __cplusplus
++}
++#endif
++
++#endif
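
[Aside: again a usage sketch, not part of the patch: scalar half-precision arithmetic with the intrinsics above. This only compiles when __ARM_FEATURE_FP16_SCALAR_ARITHMETIC is defined, i.e. with an architecture/FPU selection providing the ARMv8.2-A FP16 extension; the flags needed depend on the target:

#include <arm_fp16.h>

float16_t
fused_step (float16_t acc, float16_t a, float16_t b)
{
  /* vfmah_f16 returns acc + a * b without intermediate rounding;
     vrndnh_f16 then rounds to the nearest integral value.  */
  return vrndnh_f16 (vfmah_f16 (acc, a, b));
}
]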
+--- a/src/gcc/config/arm/arm_neon.h
++++ b/src/gcc/config/arm/arm_neon.h
+@@ -38,6 +38,7 @@
+ extern "C" {
+ #endif
+
++#include <arm_fp16.h>
+ #include <stdint.h>
+
+ typedef __simd64_int8_t int8x8_t;
+@@ -509,528 +510,614 @@ typedef struct poly64x2x4_t
+ #pragma GCC pop_options
+
+ /* vadd */
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vadd_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return __a + __b;
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vadd_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return __a + __b;
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vadd_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return __a + __b;
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vadd_f32 (float32x2_t __a, float32x2_t __b)
+ {
+-#ifdef __FAST_MATH
++#ifdef __FAST_MATH__
+ return __a + __b;
+ #else
+ return (float32x2_t) __builtin_neon_vaddv2sf (__a, __b);
+ #endif
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vadd_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return __a + __b;
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vadd_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return __a + __b;
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vadd_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return __a + __b;
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vadd_s64 (int64x1_t __a, int64x1_t __b)
+ {
+ return __a + __b;
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vadd_u64 (uint64x1_t __a, uint64x1_t __b)
+ {
+ return __a + __b;
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return __a + __b;
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return __a + __b;
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return __a + __b;
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddq_s64 (int64x2_t __a, int64x2_t __b)
+ {
+ return __a + __b;
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+-#ifdef __FAST_MATH
++#ifdef __FAST_MATH__
+ return __a + __b;
+ #else
+ return (float32x4_t) __builtin_neon_vaddv4sf (__a, __b);
+ #endif
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return __a + __b;
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return __a + __b;
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return __a + __b;
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddq_u64 (uint64x2_t __a, uint64x2_t __b)
+ {
+ return __a + __b;
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddl_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vaddlsv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddl_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vaddlsv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddl_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int64x2_t)__builtin_neon_vaddlsv2si (__a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddl_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vaddluv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddl_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vaddluv4hi ((int16x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddl_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint64x2_t)__builtin_neon_vaddluv2si ((int32x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddw_s8 (int16x8_t __a, int8x8_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vaddwsv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddw_s16 (int32x4_t __a, int16x4_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vaddwsv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddw_s32 (int64x2_t __a, int32x2_t __b)
+ {
+ return (int64x2_t)__builtin_neon_vaddwsv2si (__a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddw_u8 (uint16x8_t __a, uint8x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vaddwuv8qi ((int16x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddw_u16 (uint32x4_t __a, uint16x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vaddwuv4hi ((int32x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddw_u32 (uint64x2_t __a, uint32x2_t __b)
+ {
+ return (uint64x2_t)__builtin_neon_vaddwuv2si ((int64x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhadd_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (int8x8_t)__builtin_neon_vhaddsv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhadd_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vhaddsv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhadd_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vhaddsv2si (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhadd_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vhadduv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhadd_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vhadduv4hi ((int16x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhadd_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vhadduv2si ((int32x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhaddq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return (int8x16_t)__builtin_neon_vhaddsv16qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhaddq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vhaddsv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhaddq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vhaddsv4si (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhaddq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vhadduv16qi ((int8x16_t) __a, (int8x16_t) __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhaddq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vhadduv8hi ((int16x8_t) __a, (int16x8_t) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhaddq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vhadduv4si ((int32x4_t) __a, (int32x4_t) __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrhadd_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (int8x8_t)__builtin_neon_vrhaddsv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrhadd_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vrhaddsv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrhadd_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vrhaddsv2si (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrhadd_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vrhadduv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrhadd_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vrhadduv4hi ((int16x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrhadd_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vrhadduv2si ((int32x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrhaddq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return (int8x16_t)__builtin_neon_vrhaddsv16qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrhaddq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vrhaddsv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrhaddq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vrhaddsv4si (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrhaddq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vrhadduv16qi ((int8x16_t) __a, (int8x16_t) __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrhaddq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vrhadduv8hi ((int16x8_t) __a, (int16x8_t) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrhaddq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vrhadduv4si ((int32x4_t) __a, (int32x4_t) __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqadd_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (int8x8_t)__builtin_neon_vqaddsv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqadd_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vqaddsv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqadd_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vqaddsv2si (__a, __b);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqadd_s64 (int64x1_t __a, int64x1_t __b)
+ {
+ return (int64x1_t)__builtin_neon_vqaddsdi (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqadd_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vqadduv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqadd_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vqadduv4hi ((int16x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqadd_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vqadduv2si ((int32x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqadd_u64 (uint64x1_t __a, uint64x1_t __b)
+ {
+ return (uint64x1_t)__builtin_neon_vqaddudi ((int64x1_t) __a, (int64x1_t) __b);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqaddq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return (int8x16_t)__builtin_neon_vqaddsv16qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqaddq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vqaddsv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqaddq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vqaddsv4si (__a, __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqaddq_s64 (int64x2_t __a, int64x2_t __b)
+ {
+ return (int64x2_t)__builtin_neon_vqaddsv2di (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqaddq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vqadduv16qi ((int8x16_t) __a, (int8x16_t) __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqaddq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vqadduv8hi ((int16x8_t) __a, (int16x8_t) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqaddq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vqadduv4si ((int32x4_t) __a, (int32x4_t) __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqaddq_u64 (uint64x2_t __a, uint64x2_t __b)
+ {
+ return (uint64x2_t)__builtin_neon_vqadduv2di ((int64x2_t) __a, (int64x2_t) __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddhn_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (int8x8_t)__builtin_neon_vaddhnv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddhn_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vaddhnv4si (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddhn_s64 (int64x2_t __a, int64x2_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vaddhnv2di (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddhn_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vaddhnv8hi ((int16x8_t) __a, (int16x8_t) __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddhn_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vaddhnv4si ((int32x4_t) __a, (int32x4_t) __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaddhn_u64 (uint64x2_t __a, uint64x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vaddhnv2di ((int64x2_t) __a, (int64x2_t) __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vraddhn_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (int8x8_t)__builtin_neon_vraddhnv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vraddhn_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vraddhnv4si (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vraddhn_s64 (int64x2_t __a, int64x2_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vraddhnv2di (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vraddhn_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vraddhnv8hi ((int16x8_t) __a, (int16x8_t) __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vraddhn_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vraddhnv4si ((int32x4_t) __a, (int32x4_t) __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vraddhn_u64 (uint64x2_t __a, uint64x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vraddhnv2di ((int64x2_t) __a, (int64x2_t) __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmul_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return __a * __b;
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmul_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return __a * __b;
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmul_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return __a * __b;
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmul_f32 (float32x2_t __a, float32x2_t __b)
+ {
+-#ifdef __FAST_MATH
++#ifdef __FAST_MATH__
+ return __a * __b;
+ #else
+ return (float32x2_t) __builtin_neon_vmulfv2sf (__a, __b);
+@@ -1038,493 +1125,574 @@ vmul_f32 (float32x2_t __a, float32x2_t __b)
+
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmul_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return __a * __b;
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmul_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return __a * __b;
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmul_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return __a * __b;
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmulq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return __a * __b;
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmulq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return __a * __b;
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmulq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return __a * __b;
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmulq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+-#ifdef __FAST_MATH
++#ifdef __FAST_MATH__
+ return __a * __b;
+ #else
+ return (float32x4_t) __builtin_neon_vmulfv4sf (__a, __b);
+ #endif
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmulq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return __a * __b;
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmulq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return __a * __b;
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmulq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return __a * __b;
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmul_p8 (poly8x8_t __a, poly8x8_t __b)
+ {
+ return (poly8x8_t)__builtin_neon_vmulpv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmulq_p8 (poly8x16_t __a, poly8x16_t __b)
+ {
+ return (poly8x16_t)__builtin_neon_vmulpv16qi ((int8x16_t) __a, (int8x16_t) __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmulh_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vqdmulhv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmulh_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vqdmulhv2si (__a, __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmulhq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vqdmulhv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmulhq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vqdmulhv4si (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmulh_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vqrdmulhv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmulh_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vqrdmulhv2si (__a, __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmulhq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vqrdmulhv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmulhq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vqrdmulhv4si (__a, __b);
+ }
+
+ #ifdef __ARM_FEATURE_QRDMX
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmlah_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
+ {
+ return (int16x4_t)__builtin_neon_vqrdmlahv4hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmlah_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
+ {
+ return (int32x2_t)__builtin_neon_vqrdmlahv2si (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmlahq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
+ {
+ return (int16x8_t)__builtin_neon_vqrdmlahv8hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmlahq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
+ {
+ return (int32x4_t)__builtin_neon_vqrdmlahv4si (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmlsh_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
+ {
+ return (int16x4_t)__builtin_neon_vqrdmlshv4hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmlsh_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
+ {
+ return (int32x2_t)__builtin_neon_vqrdmlshv2si (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmlshq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
+ {
+ return (int16x8_t)__builtin_neon_vqrdmlshv8hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmlshq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
+ {
+ return (int32x4_t)__builtin_neon_vqrdmlshv4si (__a, __b, __c);
+ }
+ #endif
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmull_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vmullsv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmull_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vmullsv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmull_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int64x2_t)__builtin_neon_vmullsv2si (__a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmull_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vmulluv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmull_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vmulluv4hi ((int16x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmull_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint64x2_t)__builtin_neon_vmulluv2si ((int32x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmull_p8 (poly8x8_t __a, poly8x8_t __b)
+ {
+ return (poly16x8_t)__builtin_neon_vmullpv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmull_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vqdmullv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmull_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int64x2_t)__builtin_neon_vqdmullv2si (__a, __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
+ {
+ return (int8x8_t)__builtin_neon_vmlav8qi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmla_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
+ {
+ return (int16x4_t)__builtin_neon_vmlav4hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
+ {
+ return (int32x2_t)__builtin_neon_vmlav2si (__a, __b, __c);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmla_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+ {
+ return (float32x2_t)__builtin_neon_vmlav2sf (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
+ {
+ return (uint8x8_t)__builtin_neon_vmlav8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
+ {
+ return (uint16x4_t)__builtin_neon_vmlav4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
+ {
+ return (uint32x2_t)__builtin_neon_vmlav2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
+ {
+ return (int8x16_t)__builtin_neon_vmlav16qi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
+ {
+ return (int16x8_t)__builtin_neon_vmlav8hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
+ {
+ return (int32x4_t)__builtin_neon_vmlav4si (__a, __b, __c);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+ {
+ return (float32x4_t)__builtin_neon_vmlav4sf (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
+ {
+ return (uint8x16_t)__builtin_neon_vmlav16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
+ {
+ return (uint16x8_t)__builtin_neon_vmlav8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
+ {
+ return (uint32x4_t)__builtin_neon_vmlav4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c)
+ {
+ return (int16x8_t)__builtin_neon_vmlalsv8qi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
+ {
+ return (int32x4_t)__builtin_neon_vmlalsv4hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
+ {
+ return (int64x2_t)__builtin_neon_vmlalsv2si (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c)
+ {
+ return (uint16x8_t)__builtin_neon_vmlaluv8qi ((int16x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c)
+ {
+ return (uint32x4_t)__builtin_neon_vmlaluv4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c)
+ {
+ return (uint64x2_t)__builtin_neon_vmlaluv2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
+ {
+ return (int32x4_t)__builtin_neon_vqdmlalv4hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
+ {
+ return (int64x2_t)__builtin_neon_vqdmlalv2si (__a, __b, __c);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmls_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
+ {
+ return (int8x8_t)__builtin_neon_vmlsv8qi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmls_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
+ {
+ return (int16x4_t)__builtin_neon_vmlsv4hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmls_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
+ {
+ return (int32x2_t)__builtin_neon_vmlsv2si (__a, __b, __c);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmls_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+ {
+ return (float32x2_t)__builtin_neon_vmlsv2sf (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmls_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
+ {
+ return (uint8x8_t)__builtin_neon_vmlsv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmls_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
+ {
+ return (uint16x4_t)__builtin_neon_vmlsv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmls_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
+ {
+ return (uint32x2_t)__builtin_neon_vmlsv2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
+ {
+ return (int8x16_t)__builtin_neon_vmlsv16qi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
+ {
+ return (int16x8_t)__builtin_neon_vmlsv8hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
+ {
+ return (int32x4_t)__builtin_neon_vmlsv4si (__a, __b, __c);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+ {
+ return (float32x4_t)__builtin_neon_vmlsv4sf (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
+ {
+ return (uint8x16_t)__builtin_neon_vmlsv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
+ {
+ return (uint16x8_t)__builtin_neon_vmlsv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
+ {
+ return (uint32x4_t)__builtin_neon_vmlsv4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsl_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c)
+ {
+ return (int16x8_t)__builtin_neon_vmlslsv8qi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
+ {
+ return (int32x4_t)__builtin_neon_vmlslsv4hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
+ {
+ return (int64x2_t)__builtin_neon_vmlslsv2si (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsl_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c)
+ {
+ return (uint16x8_t)__builtin_neon_vmlsluv8qi ((int16x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsl_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c)
+ {
+ return (uint32x4_t)__builtin_neon_vmlsluv4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsl_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c)
+ {
+ return (uint64x2_t)__builtin_neon_vmlsluv2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
+ {
+ return (int32x4_t)__builtin_neon_vqdmlslv4hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
+ {
+ return (int64x2_t)__builtin_neon_vqdmlslv2si (__a, __b, __c);
+@@ -1532,25 +1700,29 @@ vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=neon-vfpv4")
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vfma_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+ {
+ return (float32x2_t)__builtin_neon_vfmav2sf (__a, __b, __c);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vfmaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+ {
+ return (float32x4_t)__builtin_neon_vfmav4sf (__a, __b, __c);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vfms_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+ {
+ return (float32x2_t)__builtin_neon_vfmsv2sf (__a, __b, __c);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+ {
+ return (float32x4_t)__builtin_neon_vfmsv4sf (__a, __b, __c);
+@@ -1558,7 +1730,8 @@ vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+ #pragma GCC pop_options
+
+ #if __ARM_ARCH >= 8
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrndn_f32 (float32x2_t __a)
+ {
+ return (float32x2_t)__builtin_neon_vrintnv2sf (__a);
+@@ -1566,7 +1739,8 @@ vrndn_f32 (float32x2_t __a)
+
+ #endif
+ #if __ARM_ARCH >= 8
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrndnq_f32 (float32x4_t __a)
+ {
+ return (float32x4_t)__builtin_neon_vrintnv4sf (__a);
+@@ -1574,7 +1748,8 @@ vrndnq_f32 (float32x4_t __a)
+
+ #endif
+ #if __ARM_ARCH >= 8
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrnda_f32 (float32x2_t __a)
+ {
+ return (float32x2_t)__builtin_neon_vrintav2sf (__a);
+@@ -1582,7 +1757,8 @@ vrnda_f32 (float32x2_t __a)
+
+ #endif
+ #if __ARM_ARCH >= 8
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrndaq_f32 (float32x4_t __a)
+ {
+ return (float32x4_t)__builtin_neon_vrintav4sf (__a);
+@@ -1590,7 +1766,8 @@ vrndaq_f32 (float32x4_t __a)
+
+ #endif
+ #if __ARM_ARCH >= 8
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrndp_f32 (float32x2_t __a)
+ {
+ return (float32x2_t)__builtin_neon_vrintpv2sf (__a);
+@@ -1598,7 +1775,8 @@ vrndp_f32 (float32x2_t __a)
+
+ #endif
+ #if __ARM_ARCH >= 8
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrndpq_f32 (float32x4_t __a)
+ {
+ return (float32x4_t)__builtin_neon_vrintpv4sf (__a);
+@@ -1606,7 +1784,8 @@ vrndpq_f32 (float32x4_t __a)
+
+ #endif
+ #if __ARM_ARCH >= 8
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrndm_f32 (float32x2_t __a)
+ {
+ return (float32x2_t)__builtin_neon_vrintmv2sf (__a);
+@@ -1614,7 +1793,8 @@ vrndm_f32 (float32x2_t __a)
+
+ #endif
+ #if __ARM_ARCH >= 8
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrndmq_f32 (float32x4_t __a)
+ {
+ return (float32x4_t)__builtin_neon_vrintmv4sf (__a);
+@@ -1623,7 +1803,8 @@ vrndmq_f32 (float32x4_t __a)
+ #endif
+
+ #if __ARM_ARCH >= 8
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrndx_f32 (float32x2_t __a)
+ {
+ return (float32x2_t)__builtin_neon_vrintxv2sf (__a);
+@@ -1632,7 +1813,8 @@ vrndx_f32 (float32x2_t __a)
+ #endif
+
+ #if __ARM_ARCH >= 8
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrndxq_f32 (float32x4_t __a)
+ {
+ return (float32x4_t)__builtin_neon_vrintxv4sf (__a);
+@@ -1641,7 +1823,8 @@ vrndxq_f32 (float32x4_t __a)
+ #endif
+
+ #if __ARM_ARCH >= 8
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrnd_f32 (float32x2_t __a)
+ {
+ return (float32x2_t)__builtin_neon_vrintzv2sf (__a);
+@@ -1649,7 +1832,8 @@ vrnd_f32 (float32x2_t __a)
+
+ #endif
+ #if __ARM_ARCH >= 8
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrndq_f32 (float32x4_t __a)
+ {
+ return (float32x4_t)__builtin_neon_vrintzv4sf (__a);
+@@ -1657,2907 +1841,3436 @@ vrndq_f32 (float32x4_t __a)
+
+ #endif
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsub_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return __a - __b;
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsub_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return __a - __b;
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsub_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return __a - __b;
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsub_f32 (float32x2_t __a, float32x2_t __b)
+ {
+-#ifdef __FAST_MATH
++#ifdef __FAST_MATH__
+ return __a - __b;
+ #else
+ return (float32x2_t) __builtin_neon_vsubv2sf (__a, __b);
+ #endif
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsub_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return __a - __b;
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsub_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return __a - __b;
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsub_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return __a - __b;
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsub_s64 (int64x1_t __a, int64x1_t __b)
+ {
+ return __a - __b;
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsub_u64 (uint64x1_t __a, uint64x1_t __b)
+ {
+ return __a - __b;
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return __a - __b;
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return __a - __b;
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return __a - __b;
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubq_s64 (int64x2_t __a, int64x2_t __b)
+ {
+ return __a - __b;
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+-#ifdef __FAST_MATH
++#ifdef __FAST_MATH__
+ return __a - __b;
+ #else
+ return (float32x4_t) __builtin_neon_vsubv4sf (__a, __b);
+ #endif
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return __a - __b;
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return __a - __b;
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return __a - __b;
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubq_u64 (uint64x2_t __a, uint64x2_t __b)
+ {
+ return __a - __b;
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubl_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vsublsv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubl_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vsublsv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubl_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int64x2_t)__builtin_neon_vsublsv2si (__a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubl_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vsubluv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubl_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vsubluv4hi ((int16x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubl_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint64x2_t)__builtin_neon_vsubluv2si ((int32x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubw_s8 (int16x8_t __a, int8x8_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vsubwsv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubw_s16 (int32x4_t __a, int16x4_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vsubwsv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubw_s32 (int64x2_t __a, int32x2_t __b)
+ {
+ return (int64x2_t)__builtin_neon_vsubwsv2si (__a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubw_u8 (uint16x8_t __a, uint8x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vsubwuv8qi ((int16x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubw_u16 (uint32x4_t __a, uint16x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vsubwuv4hi ((int32x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubw_u32 (uint64x2_t __a, uint32x2_t __b)
+ {
+ return (uint64x2_t)__builtin_neon_vsubwuv2si ((int64x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhsub_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (int8x8_t)__builtin_neon_vhsubsv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhsub_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vhsubsv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhsub_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vhsubsv2si (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhsub_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vhsubuv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhsub_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vhsubuv4hi ((int16x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhsub_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vhsubuv2si ((int32x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhsubq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return (int8x16_t)__builtin_neon_vhsubsv16qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhsubq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vhsubsv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhsubq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vhsubsv4si (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhsubq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vhsubuv16qi ((int8x16_t) __a, (int8x16_t) __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhsubq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vhsubuv8hi ((int16x8_t) __a, (int16x8_t) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vhsubq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vhsubuv4si ((int32x4_t) __a, (int32x4_t) __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqsub_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (int8x8_t)__builtin_neon_vqsubsv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqsub_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vqsubsv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqsub_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vqsubsv2si (__a, __b);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqsub_s64 (int64x1_t __a, int64x1_t __b)
+ {
+ return (int64x1_t)__builtin_neon_vqsubsdi (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqsub_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vqsubuv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqsub_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vqsubuv4hi ((int16x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqsub_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vqsubuv2si ((int32x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqsub_u64 (uint64x1_t __a, uint64x1_t __b)
+ {
+ return (uint64x1_t)__builtin_neon_vqsubudi ((int64x1_t) __a, (int64x1_t) __b);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqsubq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return (int8x16_t)__builtin_neon_vqsubsv16qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqsubq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vqsubsv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqsubq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vqsubsv4si (__a, __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqsubq_s64 (int64x2_t __a, int64x2_t __b)
+ {
+ return (int64x2_t)__builtin_neon_vqsubsv2di (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqsubq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vqsubuv16qi ((int8x16_t) __a, (int8x16_t) __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqsubq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vqsubuv8hi ((int16x8_t) __a, (int16x8_t) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqsubq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vqsubuv4si ((int32x4_t) __a, (int32x4_t) __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqsubq_u64 (uint64x2_t __a, uint64x2_t __b)
+ {
+ return (uint64x2_t)__builtin_neon_vqsubuv2di ((int64x2_t) __a, (int64x2_t) __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubhn_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (int8x8_t)__builtin_neon_vsubhnv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubhn_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vsubhnv4si (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubhn_s64 (int64x2_t __a, int64x2_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vsubhnv2di (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubhn_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vsubhnv8hi ((int16x8_t) __a, (int16x8_t) __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubhn_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vsubhnv4si ((int32x4_t) __a, (int32x4_t) __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsubhn_u64 (uint64x2_t __a, uint64x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vsubhnv2di ((int64x2_t) __a, (int64x2_t) __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsubhn_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (int8x8_t)__builtin_neon_vrsubhnv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsubhn_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vrsubhnv4si (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsubhn_s64 (int64x2_t __a, int64x2_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vrsubhnv2di (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsubhn_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vrsubhnv8hi ((int16x8_t) __a, (int16x8_t) __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsubhn_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vrsubhnv4si ((int32x4_t) __a, (int32x4_t) __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsubhn_u64 (uint64x2_t __a, uint64x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vrsubhnv2di ((int64x2_t) __a, (int64x2_t) __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vceq_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vceqv8qi (__a, __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vceq_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vceqv4hi (__a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vceq_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vceqv2si (__a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vceq_f32 (float32x2_t __a, float32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vceqv2sf (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vceq_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vceqv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vceq_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vceqv4hi ((int16x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vceq_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vceqv2si ((int32x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vceq_p8 (poly8x8_t __a, poly8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vceqv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vceqq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vceqv16qi (__a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vceqq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vceqv8hi (__a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vceqq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vceqv4si (__a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vceqq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vceqv4sf (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vceqq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vceqv16qi ((int8x16_t) __a, (int8x16_t) __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vceqq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vceqv8hi ((int16x8_t) __a, (int16x8_t) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vceqq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vceqv4si ((int32x4_t) __a, (int32x4_t) __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vceqq_p8 (poly8x16_t __a, poly8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vceqv16qi ((int8x16_t) __a, (int8x16_t) __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcge_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vcgev8qi (__a, __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcge_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vcgev4hi (__a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcge_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vcgev2si (__a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcge_f32 (float32x2_t __a, float32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vcgev2sf (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcge_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vcgeuv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcge_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vcgeuv4hi ((int16x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcge_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vcgeuv2si ((int32x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcgeq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vcgev16qi (__a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcgeq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vcgev8hi (__a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcgeq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vcgev4si (__a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcgeq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vcgev4sf (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcgeq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vcgeuv16qi ((int8x16_t) __a, (int8x16_t) __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcgeq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vcgeuv8hi ((int16x8_t) __a, (int16x8_t) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcgeq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vcgeuv4si ((int32x4_t) __a, (int32x4_t) __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcle_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vcgev8qi (__b, __a);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcle_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vcgev4hi (__b, __a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcle_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vcgev2si (__b, __a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcle_f32 (float32x2_t __a, float32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vcgev2sf (__b, __a);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcle_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vcgeuv8qi ((int8x8_t) __b, (int8x8_t) __a);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcle_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vcgeuv4hi ((int16x4_t) __b, (int16x4_t) __a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcle_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vcgeuv2si ((int32x2_t) __b, (int32x2_t) __a);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcleq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vcgev16qi (__b, __a);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcleq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vcgev8hi (__b, __a);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcleq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vcgev4si (__b, __a);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcleq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vcgev4sf (__b, __a);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcleq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vcgeuv16qi ((int8x16_t) __b, (int8x16_t) __a);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcleq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vcgeuv8hi ((int16x8_t) __b, (int16x8_t) __a);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcleq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vcgeuv4si ((int32x4_t) __b, (int32x4_t) __a);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcgt_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vcgtv8qi (__a, __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcgt_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vcgtv4hi (__a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcgt_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vcgtv2si (__a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcgt_f32 (float32x2_t __a, float32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vcgtv2sf (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcgt_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vcgtuv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcgt_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vcgtuv4hi ((int16x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcgt_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vcgtuv2si ((int32x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcgtq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vcgtv16qi (__a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcgtq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vcgtv8hi (__a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcgtq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vcgtv4si (__a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcgtq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vcgtv4sf (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcgtq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vcgtuv16qi ((int8x16_t) __a, (int8x16_t) __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcgtq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vcgtuv8hi ((int16x8_t) __a, (int16x8_t) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcgtq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vcgtuv4si ((int32x4_t) __a, (int32x4_t) __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclt_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vcgtv8qi (__b, __a);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclt_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vcgtv4hi (__b, __a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclt_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vcgtv2si (__b, __a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclt_f32 (float32x2_t __a, float32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vcgtv2sf (__b, __a);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclt_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vcgtuv8qi ((int8x8_t) __b, (int8x8_t) __a);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclt_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vcgtuv4hi ((int16x4_t) __b, (int16x4_t) __a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclt_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vcgtuv2si ((int32x2_t) __b, (int32x2_t) __a);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcltq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vcgtv16qi (__b, __a);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcltq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vcgtv8hi (__b, __a);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcltq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vcgtv4si (__b, __a);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcltq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vcgtv4sf (__b, __a);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcltq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vcgtuv16qi ((int8x16_t) __b, (int8x16_t) __a);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcltq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vcgtuv8hi ((int16x8_t) __b, (int16x8_t) __a);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcltq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vcgtuv4si ((int32x4_t) __b, (int32x4_t) __a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcage_f32 (float32x2_t __a, float32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vcagev2sf (__a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcageq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vcagev4sf (__a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcale_f32 (float32x2_t __a, float32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vcagev2sf (__b, __a);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcaleq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vcagev4sf (__b, __a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcagt_f32 (float32x2_t __a, float32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vcagtv2sf (__a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcagtq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vcagtv4sf (__a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcalt_f32 (float32x2_t __a, float32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vcagtv2sf (__b, __a);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcaltq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vcagtv4sf (__b, __a);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtst_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vtstv8qi (__a, __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtst_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vtstv4hi (__a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtst_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vtstv2si (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtst_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vtstv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtst_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vtstv4hi ((int16x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtst_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vtstv2si ((int32x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtst_p8 (poly8x8_t __a, poly8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vtstv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vtst_p16 (poly16x4_t __a, poly16x4_t __b)
++{
++ return (uint16x4_t)__builtin_neon_vtstv4hi ((int16x4_t) __a, (int16x4_t) __b);
++}
++
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtstq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vtstv16qi (__a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtstq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vtstv8hi (__a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtstq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vtstv4si (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtstq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vtstv16qi ((int8x16_t) __a, (int8x16_t) __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtstq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vtstv8hi ((int16x8_t) __a, (int16x8_t) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtstq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vtstv4si ((int32x4_t) __a, (int32x4_t) __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtstq_p8 (poly8x16_t __a, poly8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vtstv16qi ((int8x16_t) __a, (int8x16_t) __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vtstq_p16 (poly16x8_t __a, poly16x8_t __b)
++{
++ return (uint16x8_t)__builtin_neon_vtstv8hi ((int16x8_t) __a, (int16x8_t) __b);
++}
++
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabd_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (int8x8_t)__builtin_neon_vabdsv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabd_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vabdsv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabd_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vabdsv2si (__a, __b);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabd_f32 (float32x2_t __a, float32x2_t __b)
+ {
+ return (float32x2_t)__builtin_neon_vabdfv2sf (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabd_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vabduv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabd_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vabduv4hi ((int16x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabd_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vabduv2si ((int32x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabdq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return (int8x16_t)__builtin_neon_vabdsv16qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabdq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vabdsv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabdq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vabdsv4si (__a, __b);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabdq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+ return (float32x4_t)__builtin_neon_vabdfv4sf (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabdq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vabduv16qi ((int8x16_t) __a, (int8x16_t) __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabdq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vabduv8hi ((int16x8_t) __a, (int16x8_t) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabdq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vabduv4si ((int32x4_t) __a, (int32x4_t) __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabdl_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vabdlsv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabdl_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vabdlsv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabdl_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int64x2_t)__builtin_neon_vabdlsv2si (__a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabdl_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vabdluv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabdl_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vabdluv4hi ((int16x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabdl_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint64x2_t)__builtin_neon_vabdluv2si ((int32x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaba_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
+ {
+ return (int8x8_t)__builtin_neon_vabasv8qi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaba_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
+ {
+ return (int16x4_t)__builtin_neon_vabasv4hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaba_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
+ {
+ return (int32x2_t)__builtin_neon_vabasv2si (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaba_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
+ {
+ return (uint8x8_t)__builtin_neon_vabauv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaba_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
+ {
+ return (uint16x4_t)__builtin_neon_vabauv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaba_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
+ {
+ return (uint32x2_t)__builtin_neon_vabauv2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
+ {
+ return (int8x16_t)__builtin_neon_vabasv16qi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
+ {
+ return (int16x8_t)__builtin_neon_vabasv8hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
+ {
+ return (int32x4_t)__builtin_neon_vabasv4si (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
+ {
+ return (uint8x16_t)__builtin_neon_vabauv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
+ {
+ return (uint16x8_t)__builtin_neon_vabauv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
+ {
+ return (uint32x4_t)__builtin_neon_vabauv4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c)
+ {
+ return (int16x8_t)__builtin_neon_vabalsv8qi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
+ {
+ return (int32x4_t)__builtin_neon_vabalsv4hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
+ {
+ return (int64x2_t)__builtin_neon_vabalsv2si (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c)
+ {
+ return (uint16x8_t)__builtin_neon_vabaluv8qi ((int16x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c)
+ {
+ return (uint32x4_t)__builtin_neon_vabaluv4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c)
+ {
+ return (uint64x2_t)__builtin_neon_vabaluv2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmax_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (int8x8_t)__builtin_neon_vmaxsv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmax_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vmaxsv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmax_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vmaxsv2si (__a, __b);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmax_f32 (float32x2_t __a, float32x2_t __b)
+ {
+ return (float32x2_t)__builtin_neon_vmaxfv2sf (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmax_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vmaxuv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmax_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vmaxuv4hi ((int16x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmax_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vmaxuv2si ((int32x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmaxq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return (int8x16_t)__builtin_neon_vmaxsv16qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmaxq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vmaxsv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmaxq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vmaxsv4si (__a, __b);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmaxq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+ return (float32x4_t)__builtin_neon_vmaxfv4sf (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++#pragma GCC push_options
++#pragma GCC target ("fpu=neon-fp-armv8")
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmaxnm_f32 (float32x2_t a, float32x2_t b)
++{
++ return (float32x2_t)__builtin_neon_vmaxnmv2sf (a, b);
++}
++
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vmaxnmq_f32 (float32x4_t a, float32x4_t b)
++{
++ return (float32x4_t)__builtin_neon_vmaxnmv4sf (a, b);
++}
++
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vminnm_f32 (float32x2_t a, float32x2_t b)
++{
++ return (float32x2_t)__builtin_neon_vminnmv2sf (a, b);
++}
++
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vminnmq_f32 (float32x4_t a, float32x4_t b)
++{
++ return (float32x4_t)__builtin_neon_vminnmv4sf (a, b);
++}
++#pragma GCC pop_options
++
++
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmaxq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vmaxuv16qi ((int8x16_t) __a, (int8x16_t) __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmaxq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vmaxuv8hi ((int16x8_t) __a, (int16x8_t) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmaxq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vmaxuv4si ((int32x4_t) __a, (int32x4_t) __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmin_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (int8x8_t)__builtin_neon_vminsv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmin_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vminsv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmin_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vminsv2si (__a, __b);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmin_f32 (float32x2_t __a, float32x2_t __b)
+ {
+ return (float32x2_t)__builtin_neon_vminfv2sf (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmin_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vminuv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmin_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vminuv4hi ((int16x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmin_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vminuv2si ((int32x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vminq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return (int8x16_t)__builtin_neon_vminsv16qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vminq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vminsv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vminq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vminsv4si (__a, __b);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vminq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+ return (float32x4_t)__builtin_neon_vminfv4sf (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vminq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vminuv16qi ((int8x16_t) __a, (int8x16_t) __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vminq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vminuv8hi ((int16x8_t) __a, (int16x8_t) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vminq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vminuv4si ((int32x4_t) __a, (int32x4_t) __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpadd_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (int8x8_t)__builtin_neon_vpaddv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpadd_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vpaddv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpadd_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vpaddv2si (__a, __b);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpadd_f32 (float32x2_t __a, float32x2_t __b)
+ {
+ return (float32x2_t)__builtin_neon_vpaddv2sf (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpadd_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vpaddv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpadd_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vpaddv4hi ((int16x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpadd_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vpaddv2si ((int32x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpaddl_s8 (int8x8_t __a)
+ {
+ return (int16x4_t)__builtin_neon_vpaddlsv8qi (__a);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpaddl_s16 (int16x4_t __a)
+ {
+ return (int32x2_t)__builtin_neon_vpaddlsv4hi (__a);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpaddl_s32 (int32x2_t __a)
+ {
+ return (int64x1_t)__builtin_neon_vpaddlsv2si (__a);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpaddl_u8 (uint8x8_t __a)
+ {
+ return (uint16x4_t)__builtin_neon_vpaddluv8qi ((int8x8_t) __a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpaddl_u16 (uint16x4_t __a)
+ {
+ return (uint32x2_t)__builtin_neon_vpaddluv4hi ((int16x4_t) __a);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpaddl_u32 (uint32x2_t __a)
+ {
+ return (uint64x1_t)__builtin_neon_vpaddluv2si ((int32x2_t) __a);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpaddlq_s8 (int8x16_t __a)
+ {
+ return (int16x8_t)__builtin_neon_vpaddlsv16qi (__a);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpaddlq_s16 (int16x8_t __a)
+ {
+ return (int32x4_t)__builtin_neon_vpaddlsv8hi (__a);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpaddlq_s32 (int32x4_t __a)
+ {
+ return (int64x2_t)__builtin_neon_vpaddlsv4si (__a);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpaddlq_u8 (uint8x16_t __a)
+ {
+ return (uint16x8_t)__builtin_neon_vpaddluv16qi ((int8x16_t) __a);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpaddlq_u16 (uint16x8_t __a)
+ {
+ return (uint32x4_t)__builtin_neon_vpaddluv8hi ((int16x8_t) __a);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpaddlq_u32 (uint32x4_t __a)
+ {
+ return (uint64x2_t)__builtin_neon_vpaddluv4si ((int32x4_t) __a);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpadal_s8 (int16x4_t __a, int8x8_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vpadalsv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpadal_s16 (int32x2_t __a, int16x4_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vpadalsv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpadal_s32 (int64x1_t __a, int32x2_t __b)
+ {
+ return (int64x1_t)__builtin_neon_vpadalsv2si (__a, __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpadal_u8 (uint16x4_t __a, uint8x8_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vpadaluv8qi ((int16x4_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpadal_u16 (uint32x2_t __a, uint16x4_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vpadaluv4hi ((int32x2_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpadal_u32 (uint64x1_t __a, uint32x2_t __b)
+ {
+ return (uint64x1_t)__builtin_neon_vpadaluv2si ((int64x1_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpadalq_s8 (int16x8_t __a, int8x16_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vpadalsv16qi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpadalq_s16 (int32x4_t __a, int16x8_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vpadalsv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpadalq_s32 (int64x2_t __a, int32x4_t __b)
+ {
+ return (int64x2_t)__builtin_neon_vpadalsv4si (__a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpadalq_u8 (uint16x8_t __a, uint8x16_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vpadaluv16qi ((int16x8_t) __a, (int8x16_t) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpadalq_u16 (uint32x4_t __a, uint16x8_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vpadaluv8hi ((int32x4_t) __a, (int16x8_t) __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpadalq_u32 (uint64x2_t __a, uint32x4_t __b)
+ {
+ return (uint64x2_t)__builtin_neon_vpadaluv4si ((int64x2_t) __a, (int32x4_t) __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpmax_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (int8x8_t)__builtin_neon_vpmaxsv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpmax_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vpmaxsv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpmax_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vpmaxsv2si (__a, __b);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpmax_f32 (float32x2_t __a, float32x2_t __b)
+ {
+ return (float32x2_t)__builtin_neon_vpmaxfv2sf (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpmax_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vpmaxuv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpmax_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vpmaxuv4hi ((int16x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpmax_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vpmaxuv2si ((int32x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpmin_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (int8x8_t)__builtin_neon_vpminsv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpmin_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vpminsv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpmin_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vpminsv2si (__a, __b);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpmin_f32 (float32x2_t __a, float32x2_t __b)
+ {
+ return (float32x2_t)__builtin_neon_vpminfv2sf (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpmin_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vpminuv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpmin_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vpminuv4hi ((int16x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vpmin_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vpminuv2si ((int32x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrecps_f32 (float32x2_t __a, float32x2_t __b)
+ {
+ return (float32x2_t)__builtin_neon_vrecpsv2sf (__a, __b);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrecpsq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+ return (float32x4_t)__builtin_neon_vrecpsv4sf (__a, __b);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsqrts_f32 (float32x2_t __a, float32x2_t __b)
+ {
+ return (float32x2_t)__builtin_neon_vrsqrtsv2sf (__a, __b);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsqrtsq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+ return (float32x4_t)__builtin_neon_vrsqrtsv4sf (__a, __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshl_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (int8x8_t)__builtin_neon_vshlsv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshl_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vshlsv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshl_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vshlsv2si (__a, __b);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshl_s64 (int64x1_t __a, int64x1_t __b)
+ {
+ return (int64x1_t)__builtin_neon_vshlsdi (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshl_u8 (uint8x8_t __a, int8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vshluv8qi ((int8x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshl_u16 (uint16x4_t __a, int16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vshluv4hi ((int16x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshl_u32 (uint32x2_t __a, int32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vshluv2si ((int32x2_t) __a, __b);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshl_u64 (uint64x1_t __a, int64x1_t __b)
+ {
+ return (uint64x1_t)__builtin_neon_vshludi ((int64x1_t) __a, __b);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshlq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return (int8x16_t)__builtin_neon_vshlsv16qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshlq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vshlsv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshlq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vshlsv4si (__a, __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshlq_s64 (int64x2_t __a, int64x2_t __b)
+ {
+ return (int64x2_t)__builtin_neon_vshlsv2di (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshlq_u8 (uint8x16_t __a, int8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vshluv16qi ((int8x16_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshlq_u16 (uint16x8_t __a, int16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vshluv8hi ((int16x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshlq_u32 (uint32x4_t __a, int32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vshluv4si ((int32x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshlq_u64 (uint64x2_t __a, int64x2_t __b)
+ {
+ return (uint64x2_t)__builtin_neon_vshluv2di ((int64x2_t) __a, __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshl_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (int8x8_t)__builtin_neon_vrshlsv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshl_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vrshlsv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshl_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vrshlsv2si (__a, __b);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshl_s64 (int64x1_t __a, int64x1_t __b)
+ {
+ return (int64x1_t)__builtin_neon_vrshlsdi (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshl_u8 (uint8x8_t __a, int8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vrshluv8qi ((int8x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshl_u16 (uint16x4_t __a, int16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vrshluv4hi ((int16x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshl_u32 (uint32x2_t __a, int32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vrshluv2si ((int32x2_t) __a, __b);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshl_u64 (uint64x1_t __a, int64x1_t __b)
+ {
+ return (uint64x1_t)__builtin_neon_vrshludi ((int64x1_t) __a, __b);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshlq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return (int8x16_t)__builtin_neon_vrshlsv16qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshlq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vrshlsv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshlq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vrshlsv4si (__a, __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshlq_s64 (int64x2_t __a, int64x2_t __b)
+ {
+ return (int64x2_t)__builtin_neon_vrshlsv2di (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshlq_u8 (uint8x16_t __a, int8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vrshluv16qi ((int8x16_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshlq_u16 (uint16x8_t __a, int16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vrshluv8hi ((int16x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshlq_u32 (uint32x4_t __a, int32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vrshluv4si ((int32x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshlq_u64 (uint64x2_t __a, int64x2_t __b)
+ {
+ return (uint64x2_t)__builtin_neon_vrshluv2di ((int64x2_t) __a, __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshl_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (int8x8_t)__builtin_neon_vqshlsv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshl_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vqshlsv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshl_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vqshlsv2si (__a, __b);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshl_s64 (int64x1_t __a, int64x1_t __b)
+ {
+ return (int64x1_t)__builtin_neon_vqshlsdi (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshl_u8 (uint8x8_t __a, int8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vqshluv8qi ((int8x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshl_u16 (uint16x4_t __a, int16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vqshluv4hi ((int16x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshl_u32 (uint32x2_t __a, int32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vqshluv2si ((int32x2_t) __a, __b);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshl_u64 (uint64x1_t __a, int64x1_t __b)
+ {
+ return (uint64x1_t)__builtin_neon_vqshludi ((int64x1_t) __a, __b);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshlq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return (int8x16_t)__builtin_neon_vqshlsv16qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshlq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vqshlsv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshlq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vqshlsv4si (__a, __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshlq_s64 (int64x2_t __a, int64x2_t __b)
+ {
+ return (int64x2_t)__builtin_neon_vqshlsv2di (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshlq_u8 (uint8x16_t __a, int8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vqshluv16qi ((int8x16_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshlq_u16 (uint16x8_t __a, int16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vqshluv8hi ((int16x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshlq_u32 (uint32x4_t __a, int32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vqshluv4si ((int32x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshlq_u64 (uint64x2_t __a, int64x2_t __b)
+ {
+ return (uint64x2_t)__builtin_neon_vqshluv2di ((int64x2_t) __a, __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshl_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (int8x8_t)__builtin_neon_vqrshlsv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshl_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vqrshlsv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshl_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vqrshlsv2si (__a, __b);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshl_s64 (int64x1_t __a, int64x1_t __b)
+ {
+ return (int64x1_t)__builtin_neon_vqrshlsdi (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshl_u8 (uint8x8_t __a, int8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vqrshluv8qi ((int8x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshl_u16 (uint16x4_t __a, int16x4_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vqrshluv4hi ((int16x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshl_u32 (uint32x2_t __a, int32x2_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vqrshluv2si ((int32x2_t) __a, __b);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshl_u64 (uint64x1_t __a, int64x1_t __b)
+ {
+ return (uint64x1_t)__builtin_neon_vqrshludi ((int64x1_t) __a, __b);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshlq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return (int8x16_t)__builtin_neon_vqrshlsv16qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshlq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vqrshlsv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshlq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vqrshlsv4si (__a, __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshlq_s64 (int64x2_t __a, int64x2_t __b)
+ {
+ return (int64x2_t)__builtin_neon_vqrshlsv2di (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshlq_u8 (uint8x16_t __a, int8x16_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vqrshluv16qi ((int8x16_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshlq_u16 (uint16x8_t __a, int16x8_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vqrshluv8hi ((int16x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshlq_u32 (uint32x4_t __a, int32x4_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vqrshluv4si ((int32x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshlq_u64 (uint64x2_t __a, int64x2_t __b)
+ {
+ return (uint64x2_t)__builtin_neon_vqrshluv2di ((int64x2_t) __a, __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshr_n_s8 (int8x8_t __a, const int __b)
+ {
+ return (int8x8_t)__builtin_neon_vshrs_nv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshr_n_s16 (int16x4_t __a, const int __b)
+ {
+ return (int16x4_t)__builtin_neon_vshrs_nv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshr_n_s32 (int32x2_t __a, const int __b)
+ {
+ return (int32x2_t)__builtin_neon_vshrs_nv2si (__a, __b);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshr_n_s64 (int64x1_t __a, const int __b)
+ {
+ return (int64x1_t)__builtin_neon_vshrs_ndi (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshr_n_u8 (uint8x8_t __a, const int __b)
+ {
+ return (uint8x8_t)__builtin_neon_vshru_nv8qi ((int8x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshr_n_u16 (uint16x4_t __a, const int __b)
+ {
+ return (uint16x4_t)__builtin_neon_vshru_nv4hi ((int16x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshr_n_u32 (uint32x2_t __a, const int __b)
+ {
+ return (uint32x2_t)__builtin_neon_vshru_nv2si ((int32x2_t) __a, __b);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshr_n_u64 (uint64x1_t __a, const int __b)
+ {
+ return (uint64x1_t)__builtin_neon_vshru_ndi ((int64x1_t) __a, __b);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshrq_n_s8 (int8x16_t __a, const int __b)
+ {
+ return (int8x16_t)__builtin_neon_vshrs_nv16qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshrq_n_s16 (int16x8_t __a, const int __b)
+ {
+ return (int16x8_t)__builtin_neon_vshrs_nv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshrq_n_s32 (int32x4_t __a, const int __b)
+ {
+ return (int32x4_t)__builtin_neon_vshrs_nv4si (__a, __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshrq_n_s64 (int64x2_t __a, const int __b)
+ {
+ return (int64x2_t)__builtin_neon_vshrs_nv2di (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshrq_n_u8 (uint8x16_t __a, const int __b)
+ {
+ return (uint8x16_t)__builtin_neon_vshru_nv16qi ((int8x16_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshrq_n_u16 (uint16x8_t __a, const int __b)
+ {
+ return (uint16x8_t)__builtin_neon_vshru_nv8hi ((int16x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshrq_n_u32 (uint32x4_t __a, const int __b)
+ {
+ return (uint32x4_t)__builtin_neon_vshru_nv4si ((int32x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshrq_n_u64 (uint64x2_t __a, const int __b)
+ {
+ return (uint64x2_t)__builtin_neon_vshru_nv2di ((int64x2_t) __a, __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshr_n_s8 (int8x8_t __a, const int __b)
+ {
+ return (int8x8_t)__builtin_neon_vrshrs_nv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshr_n_s16 (int16x4_t __a, const int __b)
+ {
+ return (int16x4_t)__builtin_neon_vrshrs_nv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshr_n_s32 (int32x2_t __a, const int __b)
+ {
+ return (int32x2_t)__builtin_neon_vrshrs_nv2si (__a, __b);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshr_n_s64 (int64x1_t __a, const int __b)
+ {
+ return (int64x1_t)__builtin_neon_vrshrs_ndi (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshr_n_u8 (uint8x8_t __a, const int __b)
+ {
+ return (uint8x8_t)__builtin_neon_vrshru_nv8qi ((int8x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshr_n_u16 (uint16x4_t __a, const int __b)
+ {
+ return (uint16x4_t)__builtin_neon_vrshru_nv4hi ((int16x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshr_n_u32 (uint32x2_t __a, const int __b)
+ {
+ return (uint32x2_t)__builtin_neon_vrshru_nv2si ((int32x2_t) __a, __b);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshr_n_u64 (uint64x1_t __a, const int __b)
+ {
+ return (uint64x1_t)__builtin_neon_vrshru_ndi ((int64x1_t) __a, __b);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshrq_n_s8 (int8x16_t __a, const int __b)
+ {
+ return (int8x16_t)__builtin_neon_vrshrs_nv16qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshrq_n_s16 (int16x8_t __a, const int __b)
+ {
+ return (int16x8_t)__builtin_neon_vrshrs_nv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshrq_n_s32 (int32x4_t __a, const int __b)
+ {
+ return (int32x4_t)__builtin_neon_vrshrs_nv4si (__a, __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshrq_n_s64 (int64x2_t __a, const int __b)
+ {
+ return (int64x2_t)__builtin_neon_vrshrs_nv2di (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshrq_n_u8 (uint8x16_t __a, const int __b)
+ {
+ return (uint8x16_t)__builtin_neon_vrshru_nv16qi ((int8x16_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshrq_n_u16 (uint16x8_t __a, const int __b)
+ {
+ return (uint16x8_t)__builtin_neon_vrshru_nv8hi ((int16x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshrq_n_u32 (uint32x4_t __a, const int __b)
+ {
+ return (uint32x4_t)__builtin_neon_vrshru_nv4si ((int32x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshrq_n_u64 (uint64x2_t __a, const int __b)
+ {
+ return (uint64x2_t)__builtin_neon_vrshru_nv2di ((int64x2_t) __a, __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshrn_n_s16 (int16x8_t __a, const int __b)
+ {
+ return (int8x8_t)__builtin_neon_vshrn_nv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshrn_n_s32 (int32x4_t __a, const int __b)
+ {
+ return (int16x4_t)__builtin_neon_vshrn_nv4si (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshrn_n_s64 (int64x2_t __a, const int __b)
+ {
+ return (int32x2_t)__builtin_neon_vshrn_nv2di (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshrn_n_u16 (uint16x8_t __a, const int __b)
+ {
+ return (uint8x8_t)__builtin_neon_vshrn_nv8hi ((int16x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshrn_n_u32 (uint32x4_t __a, const int __b)
+ {
+ return (uint16x4_t)__builtin_neon_vshrn_nv4si ((int32x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshrn_n_u64 (uint64x2_t __a, const int __b)
+ {
+ return (uint32x2_t)__builtin_neon_vshrn_nv2di ((int64x2_t) __a, __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshrn_n_s16 (int16x8_t __a, const int __b)
+ {
+ return (int8x8_t)__builtin_neon_vrshrn_nv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshrn_n_s32 (int32x4_t __a, const int __b)
+ {
+ return (int16x4_t)__builtin_neon_vrshrn_nv4si (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshrn_n_s64 (int64x2_t __a, const int __b)
+ {
+ return (int32x2_t)__builtin_neon_vrshrn_nv2di (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshrn_n_u16 (uint16x8_t __a, const int __b)
+ {
+ return (uint8x8_t)__builtin_neon_vrshrn_nv8hi ((int16x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshrn_n_u32 (uint32x4_t __a, const int __b)
+ {
+ return (uint16x4_t)__builtin_neon_vrshrn_nv4si ((int32x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrshrn_n_u64 (uint64x2_t __a, const int __b)
+ {
+ return (uint32x2_t)__builtin_neon_vrshrn_nv2di ((int64x2_t) __a, __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshrn_n_s16 (int16x8_t __a, const int __b)
+ {
+ return (int8x8_t)__builtin_neon_vqshrns_nv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshrn_n_s32 (int32x4_t __a, const int __b)
+ {
+ return (int16x4_t)__builtin_neon_vqshrns_nv4si (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshrn_n_s64 (int64x2_t __a, const int __b)
+ {
+ return (int32x2_t)__builtin_neon_vqshrns_nv2di (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshrn_n_u16 (uint16x8_t __a, const int __b)
+ {
+ return (uint8x8_t)__builtin_neon_vqshrnu_nv8hi ((int16x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshrn_n_u32 (uint32x4_t __a, const int __b)
+ {
+ return (uint16x4_t)__builtin_neon_vqshrnu_nv4si ((int32x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshrn_n_u64 (uint64x2_t __a, const int __b)
+ {
+ return (uint32x2_t)__builtin_neon_vqshrnu_nv2di ((int64x2_t) __a, __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshrn_n_s16 (int16x8_t __a, const int __b)
+ {
+ return (int8x8_t)__builtin_neon_vqrshrns_nv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshrn_n_s32 (int32x4_t __a, const int __b)
+ {
+ return (int16x4_t)__builtin_neon_vqrshrns_nv4si (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshrn_n_s64 (int64x2_t __a, const int __b)
+ {
+ return (int32x2_t)__builtin_neon_vqrshrns_nv2di (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshrn_n_u16 (uint16x8_t __a, const int __b)
+ {
+ return (uint8x8_t)__builtin_neon_vqrshrnu_nv8hi ((int16x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshrn_n_u32 (uint32x4_t __a, const int __b)
+ {
+ return (uint16x4_t)__builtin_neon_vqrshrnu_nv4si ((int32x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshrn_n_u64 (uint64x2_t __a, const int __b)
+ {
+ return (uint32x2_t)__builtin_neon_vqrshrnu_nv2di ((int64x2_t) __a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshrun_n_s16 (int16x8_t __a, const int __b)
+ {
+ return (uint8x8_t)__builtin_neon_vqshrun_nv8hi (__a, __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshrun_n_s32 (int32x4_t __a, const int __b)
+ {
+ return (uint16x4_t)__builtin_neon_vqshrun_nv4si (__a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshrun_n_s64 (int64x2_t __a, const int __b)
+ {
+ return (uint32x2_t)__builtin_neon_vqshrun_nv2di (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshrun_n_s16 (int16x8_t __a, const int __b)
+ {
+ return (uint8x8_t)__builtin_neon_vqrshrun_nv8hi (__a, __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshrun_n_s32 (int32x4_t __a, const int __b)
+ {
+ return (uint16x4_t)__builtin_neon_vqrshrun_nv4si (__a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrshrun_n_s64 (int64x2_t __a, const int __b)
+ {
+ return (uint32x2_t)__builtin_neon_vqrshrun_nv2di (__a, __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshl_n_s8 (int8x8_t __a, const int __b)
+ {
+ return (int8x8_t)__builtin_neon_vshl_nv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshl_n_s16 (int16x4_t __a, const int __b)
+ {
+ return (int16x4_t)__builtin_neon_vshl_nv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshl_n_s32 (int32x2_t __a, const int __b)
+ {
+ return (int32x2_t)__builtin_neon_vshl_nv2si (__a, __b);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshl_n_s64 (int64x1_t __a, const int __b)
+ {
+ return (int64x1_t)__builtin_neon_vshl_ndi (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshl_n_u8 (uint8x8_t __a, const int __b)
+ {
+ return (uint8x8_t)__builtin_neon_vshl_nv8qi ((int8x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshl_n_u16 (uint16x4_t __a, const int __b)
+ {
+ return (uint16x4_t)__builtin_neon_vshl_nv4hi ((int16x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshl_n_u32 (uint32x2_t __a, const int __b)
+ {
+ return (uint32x2_t)__builtin_neon_vshl_nv2si ((int32x2_t) __a, __b);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshl_n_u64 (uint64x1_t __a, const int __b)
+ {
+ return (uint64x1_t)__builtin_neon_vshl_ndi ((int64x1_t) __a, __b);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshlq_n_s8 (int8x16_t __a, const int __b)
+ {
+ return (int8x16_t)__builtin_neon_vshl_nv16qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshlq_n_s16 (int16x8_t __a, const int __b)
+ {
+ return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshlq_n_s32 (int32x4_t __a, const int __b)
+ {
+ return (int32x4_t)__builtin_neon_vshl_nv4si (__a, __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshlq_n_s64 (int64x2_t __a, const int __b)
+ {
+ return (int64x2_t)__builtin_neon_vshl_nv2di (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshlq_n_u8 (uint8x16_t __a, const int __b)
+ {
+ return (uint8x16_t)__builtin_neon_vshl_nv16qi ((int8x16_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshlq_n_u16 (uint16x8_t __a, const int __b)
+ {
+ return (uint16x8_t)__builtin_neon_vshl_nv8hi ((int16x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshlq_n_u32 (uint32x4_t __a, const int __b)
+ {
+ return (uint32x4_t)__builtin_neon_vshl_nv4si ((int32x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshlq_n_u64 (uint64x2_t __a, const int __b)
+ {
+ return (uint64x2_t)__builtin_neon_vshl_nv2di ((int64x2_t) __a, __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshl_n_s8 (int8x8_t __a, const int __b)
+ {
+ return (int8x8_t)__builtin_neon_vqshl_s_nv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshl_n_s16 (int16x4_t __a, const int __b)
+ {
+ return (int16x4_t)__builtin_neon_vqshl_s_nv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshl_n_s32 (int32x2_t __a, const int __b)
+ {
+ return (int32x2_t)__builtin_neon_vqshl_s_nv2si (__a, __b);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshl_n_s64 (int64x1_t __a, const int __b)
+ {
+ return (int64x1_t)__builtin_neon_vqshl_s_ndi (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshl_n_u8 (uint8x8_t __a, const int __b)
+ {
+ return (uint8x8_t)__builtin_neon_vqshl_u_nv8qi ((int8x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshl_n_u16 (uint16x4_t __a, const int __b)
+ {
+ return (uint16x4_t)__builtin_neon_vqshl_u_nv4hi ((int16x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshl_n_u32 (uint32x2_t __a, const int __b)
+ {
+ return (uint32x2_t)__builtin_neon_vqshl_u_nv2si ((int32x2_t) __a, __b);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshl_n_u64 (uint64x1_t __a, const int __b)
+ {
+ return (uint64x1_t)__builtin_neon_vqshl_u_ndi ((int64x1_t) __a, __b);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshlq_n_s8 (int8x16_t __a, const int __b)
+ {
+ return (int8x16_t)__builtin_neon_vqshl_s_nv16qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshlq_n_s16 (int16x8_t __a, const int __b)
+ {
+ return (int16x8_t)__builtin_neon_vqshl_s_nv8hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshlq_n_s32 (int32x4_t __a, const int __b)
+ {
+ return (int32x4_t)__builtin_neon_vqshl_s_nv4si (__a, __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshlq_n_s64 (int64x2_t __a, const int __b)
+ {
+ return (int64x2_t)__builtin_neon_vqshl_s_nv2di (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshlq_n_u8 (uint8x16_t __a, const int __b)
+ {
+ return (uint8x16_t)__builtin_neon_vqshl_u_nv16qi ((int8x16_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshlq_n_u16 (uint16x8_t __a, const int __b)
+ {
+ return (uint16x8_t)__builtin_neon_vqshl_u_nv8hi ((int16x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshlq_n_u32 (uint32x4_t __a, const int __b)
+ {
+ return (uint32x4_t)__builtin_neon_vqshl_u_nv4si ((int32x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshlq_n_u64 (uint64x2_t __a, const int __b)
+ {
+ return (uint64x2_t)__builtin_neon_vqshl_u_nv2di ((int64x2_t) __a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshlu_n_s8 (int8x8_t __a, const int __b)
+ {
+ return (uint8x8_t)__builtin_neon_vqshlu_nv8qi (__a, __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshlu_n_s16 (int16x4_t __a, const int __b)
+ {
+ return (uint16x4_t)__builtin_neon_vqshlu_nv4hi (__a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshlu_n_s32 (int32x2_t __a, const int __b)
+ {
+ return (uint32x2_t)__builtin_neon_vqshlu_nv2si (__a, __b);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshlu_n_s64 (int64x1_t __a, const int __b)
+ {
+ return (uint64x1_t)__builtin_neon_vqshlu_ndi (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshluq_n_s8 (int8x16_t __a, const int __b)
+ {
+ return (uint8x16_t)__builtin_neon_vqshlu_nv16qi (__a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshluq_n_s16 (int16x8_t __a, const int __b)
+ {
+ return (uint16x8_t)__builtin_neon_vqshlu_nv8hi (__a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshluq_n_s32 (int32x4_t __a, const int __b)
+ {
+ return (uint32x4_t)__builtin_neon_vqshlu_nv4si (__a, __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqshluq_n_s64 (int64x2_t __a, const int __b)
+ {
+ return (uint64x2_t)__builtin_neon_vqshlu_nv2di (__a, __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshll_n_s8 (int8x8_t __a, const int __b)
+ {
+ return (int16x8_t)__builtin_neon_vshlls_nv8qi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshll_n_s16 (int16x4_t __a, const int __b)
+ {
+ return (int32x4_t)__builtin_neon_vshlls_nv4hi (__a, __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshll_n_s32 (int32x2_t __a, const int __b)
+ {
+ return (int64x2_t)__builtin_neon_vshlls_nv2si (__a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshll_n_u8 (uint8x8_t __a, const int __b)
+ {
+ return (uint16x8_t)__builtin_neon_vshllu_nv8qi ((int8x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshll_n_u16 (uint16x4_t __a, const int __b)
+ {
+ return (uint32x4_t)__builtin_neon_vshllu_nv4hi ((int16x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vshll_n_u32 (uint32x2_t __a, const int __b)
+ {
+ return (uint64x2_t)__builtin_neon_vshllu_nv2si ((int32x2_t) __a, __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
+ {
+ return (int8x8_t)__builtin_neon_vsras_nv8qi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+ {
+ return (int16x4_t)__builtin_neon_vsras_nv4hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+ {
+ return (int32x2_t)__builtin_neon_vsras_nv2si (__a, __b, __c);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
+ {
+ return (int64x1_t)__builtin_neon_vsras_ndi (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
+ {
+ return (uint8x8_t)__builtin_neon_vsrau_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+ {
+ return (uint16x4_t)__builtin_neon_vsrau_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+ {
+ return (uint32x2_t)__builtin_neon_vsrau_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+ {
+ return (uint64x1_t)__builtin_neon_vsrau_ndi ((int64x1_t) __a, (int64x1_t) __b, __c);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
+ {
+ return (int8x16_t)__builtin_neon_vsras_nv16qi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+ {
+ return (int16x8_t)__builtin_neon_vsras_nv8hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+ {
+ return (int32x4_t)__builtin_neon_vsras_nv4si (__a, __b, __c);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+ {
+ return (int64x2_t)__builtin_neon_vsras_nv2di (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+ {
+ return (uint8x16_t)__builtin_neon_vsrau_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+ {
+ return (uint16x8_t)__builtin_neon_vsrau_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+ {
+ return (uint32x4_t)__builtin_neon_vsrau_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
+ {
+ return (uint64x2_t)__builtin_neon_vsrau_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
+ {
+ return (int8x8_t)__builtin_neon_vrsras_nv8qi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+ {
+ return (int16x4_t)__builtin_neon_vrsras_nv4hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+ {
+ return (int32x2_t)__builtin_neon_vrsras_nv2si (__a, __b, __c);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
+ {
+ return (int64x1_t)__builtin_neon_vrsras_ndi (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
+ {
+ return (uint8x8_t)__builtin_neon_vrsrau_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+ {
+ return (uint16x4_t)__builtin_neon_vrsrau_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+ {
+ return (uint32x2_t)__builtin_neon_vrsrau_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+ {
+ return (uint64x1_t)__builtin_neon_vrsrau_ndi ((int64x1_t) __a, (int64x1_t) __b, __c);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
+ {
+ return (int8x16_t)__builtin_neon_vrsras_nv16qi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+ {
+ return (int16x8_t)__builtin_neon_vrsras_nv8hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+ {
+ return (int32x4_t)__builtin_neon_vrsras_nv4si (__a, __b, __c);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+ {
+ return (int64x2_t)__builtin_neon_vrsras_nv2di (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+ {
+ return (uint8x16_t)__builtin_neon_vrsrau_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+ {
+ return (uint16x8_t)__builtin_neon_vrsrau_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+ {
+ return (uint32x4_t)__builtin_neon_vrsrau_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
+ {
+ return (uint64x2_t)__builtin_neon_vrsrau_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
+@@ -4565,68 +5278,79 @@ vrsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsri_n_p64 (poly64x1_t __a, poly64x1_t __b, const int __c)
+ {
+ return (poly64x1_t)__builtin_neon_vsri_ndi (__a, __b, __c);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsri_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
+ {
+ return (int8x8_t)__builtin_neon_vsri_nv8qi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsri_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+ {
+ return (int16x4_t)__builtin_neon_vsri_nv4hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsri_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+ {
+ return (int32x2_t)__builtin_neon_vsri_nv2si (__a, __b, __c);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsri_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
+ {
+ return (int64x1_t)__builtin_neon_vsri_ndi (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsri_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
+ {
+ return (uint8x8_t)__builtin_neon_vsri_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsri_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+ {
+ return (uint16x4_t)__builtin_neon_vsri_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsri_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+ {
+ return (uint32x2_t)__builtin_neon_vsri_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsri_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+ {
+ return (uint64x1_t)__builtin_neon_vsri_ndi ((int64x1_t) __a, (int64x1_t) __b, __c);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsri_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
+ {
+ return (poly8x8_t)__builtin_neon_vsri_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsri_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
+ {
+ return (poly16x4_t)__builtin_neon_vsri_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+@@ -4634,68 +5358,79 @@ vsri_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsriq_n_p64 (poly64x2_t __a, poly64x2_t __b, const int __c)
+ {
+ return (poly64x2_t)__builtin_neon_vsri_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsriq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
+ {
+ return (int8x16_t)__builtin_neon_vsri_nv16qi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsriq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+ {
+ return (int16x8_t)__builtin_neon_vsri_nv8hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsriq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+ {
+ return (int32x4_t)__builtin_neon_vsri_nv4si (__a, __b, __c);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsriq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+ {
+ return (int64x2_t)__builtin_neon_vsri_nv2di (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsriq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+ {
+ return (uint8x16_t)__builtin_neon_vsri_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsriq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+ {
+ return (uint16x8_t)__builtin_neon_vsri_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsriq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+ {
+ return (uint32x4_t)__builtin_neon_vsri_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsriq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
+ {
+ return (uint64x2_t)__builtin_neon_vsri_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsriq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
+ {
+ return (poly8x16_t)__builtin_neon_vsri_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsriq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
+ {
+ return (poly16x8_t)__builtin_neon_vsri_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+@@ -4703,68 +5438,79 @@ vsriq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsli_n_p64 (poly64x1_t __a, poly64x1_t __b, const int __c)
+ {
+ return (poly64x1_t)__builtin_neon_vsli_ndi (__a, __b, __c);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsli_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
+ {
+ return (int8x8_t)__builtin_neon_vsli_nv8qi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsli_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+ {
+ return (int16x4_t)__builtin_neon_vsli_nv4hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsli_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+ {
+ return (int32x2_t)__builtin_neon_vsli_nv2si (__a, __b, __c);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsli_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
+ {
+ return (int64x1_t)__builtin_neon_vsli_ndi (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsli_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
+ {
+ return (uint8x8_t)__builtin_neon_vsli_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsli_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+ {
+ return (uint16x4_t)__builtin_neon_vsli_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsli_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+ {
+ return (uint32x2_t)__builtin_neon_vsli_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsli_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+ {
+ return (uint64x1_t)__builtin_neon_vsli_ndi ((int64x1_t) __a, (int64x1_t) __b, __c);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsli_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
+ {
+ return (poly8x8_t)__builtin_neon_vsli_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsli_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
+ {
+ return (poly16x4_t)__builtin_neon_vsli_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+@@ -4772,530 +5518,618 @@ vsli_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsliq_n_p64 (poly64x2_t __a, poly64x2_t __b, const int __c)
+ {
+ return (poly64x2_t)__builtin_neon_vsli_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsliq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
+ {
+ return (int8x16_t)__builtin_neon_vsli_nv16qi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsliq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+ {
+ return (int16x8_t)__builtin_neon_vsli_nv8hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsliq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+ {
+ return (int32x4_t)__builtin_neon_vsli_nv4si (__a, __b, __c);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsliq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+ {
+ return (int64x2_t)__builtin_neon_vsli_nv2di (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsliq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+ {
+ return (uint8x16_t)__builtin_neon_vsli_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsliq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+ {
+ return (uint16x8_t)__builtin_neon_vsli_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsliq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+ {
+ return (uint32x4_t)__builtin_neon_vsli_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsliq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
+ {
+ return (uint64x2_t)__builtin_neon_vsli_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsliq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
+ {
+ return (poly8x16_t)__builtin_neon_vsli_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsliq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
+ {
+ return (poly16x8_t)__builtin_neon_vsli_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabs_s8 (int8x8_t __a)
+ {
+ return (int8x8_t)__builtin_neon_vabsv8qi (__a);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabs_s16 (int16x4_t __a)
+ {
+ return (int16x4_t)__builtin_neon_vabsv4hi (__a);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabs_s32 (int32x2_t __a)
+ {
+ return (int32x2_t)__builtin_neon_vabsv2si (__a);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabs_f32 (float32x2_t __a)
+ {
+ return (float32x2_t)__builtin_neon_vabsv2sf (__a);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabsq_s8 (int8x16_t __a)
+ {
+ return (int8x16_t)__builtin_neon_vabsv16qi (__a);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabsq_s16 (int16x8_t __a)
+ {
+ return (int16x8_t)__builtin_neon_vabsv8hi (__a);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabsq_s32 (int32x4_t __a)
+ {
+ return (int32x4_t)__builtin_neon_vabsv4si (__a);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vabsq_f32 (float32x4_t __a)
+ {
+ return (float32x4_t)__builtin_neon_vabsv4sf (__a);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqabs_s8 (int8x8_t __a)
+ {
+ return (int8x8_t)__builtin_neon_vqabsv8qi (__a);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqabs_s16 (int16x4_t __a)
+ {
+ return (int16x4_t)__builtin_neon_vqabsv4hi (__a);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqabs_s32 (int32x2_t __a)
+ {
+ return (int32x2_t)__builtin_neon_vqabsv2si (__a);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqabsq_s8 (int8x16_t __a)
+ {
+ return (int8x16_t)__builtin_neon_vqabsv16qi (__a);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqabsq_s16 (int16x8_t __a)
+ {
+ return (int16x8_t)__builtin_neon_vqabsv8hi (__a);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqabsq_s32 (int32x4_t __a)
+ {
+ return (int32x4_t)__builtin_neon_vqabsv4si (__a);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vneg_s8 (int8x8_t __a)
+ {
+ return (int8x8_t)__builtin_neon_vnegv8qi (__a);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vneg_s16 (int16x4_t __a)
+ {
+ return (int16x4_t)__builtin_neon_vnegv4hi (__a);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vneg_s32 (int32x2_t __a)
+ {
+ return (int32x2_t)__builtin_neon_vnegv2si (__a);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vneg_f32 (float32x2_t __a)
+ {
+ return (float32x2_t)__builtin_neon_vnegv2sf (__a);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vnegq_s8 (int8x16_t __a)
+ {
+ return (int8x16_t)__builtin_neon_vnegv16qi (__a);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vnegq_s16 (int16x8_t __a)
+ {
+ return (int16x8_t)__builtin_neon_vnegv8hi (__a);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vnegq_s32 (int32x4_t __a)
+ {
+ return (int32x4_t)__builtin_neon_vnegv4si (__a);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vnegq_f32 (float32x4_t __a)
+ {
+ return (float32x4_t)__builtin_neon_vnegv4sf (__a);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqneg_s8 (int8x8_t __a)
+ {
+ return (int8x8_t)__builtin_neon_vqnegv8qi (__a);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqneg_s16 (int16x4_t __a)
+ {
+ return (int16x4_t)__builtin_neon_vqnegv4hi (__a);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqneg_s32 (int32x2_t __a)
+ {
+ return (int32x2_t)__builtin_neon_vqnegv2si (__a);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqnegq_s8 (int8x16_t __a)
+ {
+ return (int8x16_t)__builtin_neon_vqnegv16qi (__a);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqnegq_s16 (int16x8_t __a)
+ {
+ return (int16x8_t)__builtin_neon_vqnegv8hi (__a);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqnegq_s32 (int32x4_t __a)
+ {
+ return (int32x4_t)__builtin_neon_vqnegv4si (__a);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmvn_s8 (int8x8_t __a)
+ {
+ return (int8x8_t)__builtin_neon_vmvnv8qi (__a);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmvn_s16 (int16x4_t __a)
+ {
+ return (int16x4_t)__builtin_neon_vmvnv4hi (__a);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmvn_s32 (int32x2_t __a)
+ {
+ return (int32x2_t)__builtin_neon_vmvnv2si (__a);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmvn_u8 (uint8x8_t __a)
+ {
+ return (uint8x8_t)__builtin_neon_vmvnv8qi ((int8x8_t) __a);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmvn_u16 (uint16x4_t __a)
+ {
+ return (uint16x4_t)__builtin_neon_vmvnv4hi ((int16x4_t) __a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmvn_u32 (uint32x2_t __a)
+ {
+ return (uint32x2_t)__builtin_neon_vmvnv2si ((int32x2_t) __a);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmvn_p8 (poly8x8_t __a)
+ {
+ return (poly8x8_t)__builtin_neon_vmvnv8qi ((int8x8_t) __a);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmvnq_s8 (int8x16_t __a)
+ {
+ return (int8x16_t)__builtin_neon_vmvnv16qi (__a);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmvnq_s16 (int16x8_t __a)
+ {
+ return (int16x8_t)__builtin_neon_vmvnv8hi (__a);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmvnq_s32 (int32x4_t __a)
+ {
+ return (int32x4_t)__builtin_neon_vmvnv4si (__a);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmvnq_u8 (uint8x16_t __a)
+ {
+ return (uint8x16_t)__builtin_neon_vmvnv16qi ((int8x16_t) __a);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmvnq_u16 (uint16x8_t __a)
+ {
+ return (uint16x8_t)__builtin_neon_vmvnv8hi ((int16x8_t) __a);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmvnq_u32 (uint32x4_t __a)
+ {
+ return (uint32x4_t)__builtin_neon_vmvnv4si ((int32x4_t) __a);
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmvnq_p8 (poly8x16_t __a)
+ {
+ return (poly8x16_t)__builtin_neon_vmvnv16qi ((int8x16_t) __a);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcls_s8 (int8x8_t __a)
+ {
+ return (int8x8_t)__builtin_neon_vclsv8qi (__a);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcls_s16 (int16x4_t __a)
+ {
+ return (int16x4_t)__builtin_neon_vclsv4hi (__a);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcls_s32 (int32x2_t __a)
+ {
+ return (int32x2_t)__builtin_neon_vclsv2si (__a);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclsq_s8 (int8x16_t __a)
+ {
+ return (int8x16_t)__builtin_neon_vclsv16qi (__a);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclsq_s16 (int16x8_t __a)
+ {
+ return (int16x8_t)__builtin_neon_vclsv8hi (__a);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclsq_s32 (int32x4_t __a)
+ {
+ return (int32x4_t)__builtin_neon_vclsv4si (__a);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclz_s8 (int8x8_t __a)
+ {
+ return (int8x8_t)__builtin_neon_vclzv8qi (__a);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclz_s16 (int16x4_t __a)
+ {
+ return (int16x4_t)__builtin_neon_vclzv4hi (__a);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclz_s32 (int32x2_t __a)
+ {
+ return (int32x2_t)__builtin_neon_vclzv2si (__a);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclz_u8 (uint8x8_t __a)
+ {
+ return (uint8x8_t)__builtin_neon_vclzv8qi ((int8x8_t) __a);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclz_u16 (uint16x4_t __a)
+ {
+ return (uint16x4_t)__builtin_neon_vclzv4hi ((int16x4_t) __a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclz_u32 (uint32x2_t __a)
+ {
+ return (uint32x2_t)__builtin_neon_vclzv2si ((int32x2_t) __a);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclzq_s8 (int8x16_t __a)
+ {
+ return (int8x16_t)__builtin_neon_vclzv16qi (__a);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclzq_s16 (int16x8_t __a)
+ {
+ return (int16x8_t)__builtin_neon_vclzv8hi (__a);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclzq_s32 (int32x4_t __a)
+ {
+ return (int32x4_t)__builtin_neon_vclzv4si (__a);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclzq_u8 (uint8x16_t __a)
+ {
+ return (uint8x16_t)__builtin_neon_vclzv16qi ((int8x16_t) __a);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclzq_u16 (uint16x8_t __a)
+ {
+ return (uint16x8_t)__builtin_neon_vclzv8hi ((int16x8_t) __a);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vclzq_u32 (uint32x4_t __a)
+ {
+ return (uint32x4_t)__builtin_neon_vclzv4si ((int32x4_t) __a);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcnt_s8 (int8x8_t __a)
+ {
+ return (int8x8_t)__builtin_neon_vcntv8qi (__a);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcnt_u8 (uint8x8_t __a)
+ {
+ return (uint8x8_t)__builtin_neon_vcntv8qi ((int8x8_t) __a);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcnt_p8 (poly8x8_t __a)
+ {
+ return (poly8x8_t)__builtin_neon_vcntv8qi ((int8x8_t) __a);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcntq_s8 (int8x16_t __a)
+ {
+ return (int8x16_t)__builtin_neon_vcntv16qi (__a);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcntq_u8 (uint8x16_t __a)
+ {
+ return (uint8x16_t)__builtin_neon_vcntv16qi ((int8x16_t) __a);
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcntq_p8 (poly8x16_t __a)
+ {
+ return (poly8x16_t)__builtin_neon_vcntv16qi ((int8x16_t) __a);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrecpe_f32 (float32x2_t __a)
+ {
+ return (float32x2_t)__builtin_neon_vrecpev2sf (__a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrecpe_u32 (uint32x2_t __a)
+ {
+ return (uint32x2_t)__builtin_neon_vrecpev2si ((int32x2_t) __a);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrecpeq_f32 (float32x4_t __a)
+ {
+ return (float32x4_t)__builtin_neon_vrecpev4sf (__a);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrecpeq_u32 (uint32x4_t __a)
+ {
+ return (uint32x4_t)__builtin_neon_vrecpev4si ((int32x4_t) __a);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsqrte_f32 (float32x2_t __a)
+ {
+ return (float32x2_t)__builtin_neon_vrsqrtev2sf (__a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsqrte_u32 (uint32x2_t __a)
+ {
+ return (uint32x2_t)__builtin_neon_vrsqrtev2si ((int32x2_t) __a);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsqrteq_f32 (float32x4_t __a)
+ {
+ return (float32x4_t)__builtin_neon_vrsqrtev4sf (__a);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrsqrteq_u32 (uint32x4_t __a)
+ {
+ return (uint32x4_t)__builtin_neon_vrsqrtev4si ((int32x4_t) __a);
+ }
+
+-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_lane_s8 (int8x8_t __a, const int __b)
+ {
+ return (int8_t)__builtin_neon_vget_lanev8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_lane_s16 (int16x4_t __a, const int __b)
+ {
+ return (int16_t)__builtin_neon_vget_lanev4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_lane_s32 (int32x2_t __a, const int __b)
+ {
+ return (int32_t)__builtin_neon_vget_lanev2si (__a, __b);
+@@ -5328,67 +6162,88 @@ vget_lane_s32 (int32x2_t __a, const int __b)
+ })
+ #endif
+
+-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_lane_f32 (float32x2_t __a, const int __b)
+ {
+ return (float32_t)__builtin_neon_vget_lanev2sf (__a, __b);
+ }
+
+-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_lane_u8 (uint8x8_t __a, const int __b)
+ {
+ return (uint8_t)__builtin_neon_vget_laneuv8qi ((int8x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_lane_u16 (uint16x4_t __a, const int __b)
+ {
+ return (uint16_t)__builtin_neon_vget_laneuv4hi ((int16x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_lane_u32 (uint32x2_t __a, const int __b)
+ {
+ return (uint32_t)__builtin_neon_vget_laneuv2si ((int32x2_t) __a, __b);
+ }
+
+-__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_lane_p8 (poly8x8_t __a, const int __b)
+ {
+ return (poly8_t)__builtin_neon_vget_laneuv8qi ((int8x8_t) __a, __b);
+ }
+
+-__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_lane_p16 (poly16x4_t __a, const int __b)
+ {
+ return (poly16_t)__builtin_neon_vget_laneuv4hi ((int16x4_t) __a, __b);
+ }
+
+-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_lane_s64 (int64x1_t __a, const int __b)
+ {
+ return (int64_t)__builtin_neon_vget_lanedi (__a, __b);
+ }
+
+-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
++#pragma GCC push_options
++#pragma GCC target ("fpu=crypto-neon-fp-armv8")
++__extension__ extern __inline poly64_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
++vget_lane_p64 (poly64x1_t __a, const int __b)
++{
++ return (poly64_t)__builtin_neon_vget_lanedi ((int64x1_t) __a, __b);
++}
++
++#pragma GCC pop_options
++__extension__ extern __inline uint64_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_lane_u64 (uint64x1_t __a, const int __b)
+ {
+ return (uint64_t)__builtin_neon_vget_lanedi ((int64x1_t) __a, __b);
+ }
+
+-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vgetq_lane_s8 (int8x16_t __a, const int __b)
+ {
+ return (int8_t)__builtin_neon_vget_lanev16qi (__a, __b);
+ }
+
+-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vgetq_lane_s16 (int16x8_t __a, const int __b)
+ {
+ return (int16_t)__builtin_neon_vget_lanev8hi (__a, __b);
+ }
+
+-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vgetq_lane_s32 (int32x4_t __a, const int __b)
+ {
+ return (int32_t)__builtin_neon_vget_lanev4si (__a, __b);
+@@ -5405,67 +6260,78 @@ vgetq_lane_s32 (int32x4_t __a, const int __b)
+ })
+ #endif
+
+-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vgetq_lane_f32 (float32x4_t __a, const int __b)
+ {
+ return (float32_t)__builtin_neon_vget_lanev4sf (__a, __b);
+ }
+
+-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vgetq_lane_u8 (uint8x16_t __a, const int __b)
+ {
+ return (uint8_t)__builtin_neon_vget_laneuv16qi ((int8x16_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vgetq_lane_u16 (uint16x8_t __a, const int __b)
+ {
+ return (uint16_t)__builtin_neon_vget_laneuv8hi ((int16x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vgetq_lane_u32 (uint32x4_t __a, const int __b)
+ {
+ return (uint32_t)__builtin_neon_vget_laneuv4si ((int32x4_t) __a, __b);
+ }
+
+-__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vgetq_lane_p8 (poly8x16_t __a, const int __b)
+ {
+ return (poly8_t)__builtin_neon_vget_laneuv16qi ((int8x16_t) __a, __b);
+ }
+
+-__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vgetq_lane_p16 (poly16x8_t __a, const int __b)
+ {
+ return (poly16_t)__builtin_neon_vget_laneuv8hi ((int16x8_t) __a, __b);
+ }
+
+-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vgetq_lane_s64 (int64x2_t __a, const int __b)
+ {
+ return (int64_t)__builtin_neon_vget_lanev2di (__a, __b);
+ }
+
+-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vgetq_lane_u64 (uint64x2_t __a, const int __b)
+ {
+ return (uint64_t)__builtin_neon_vget_lanev2di ((int64x2_t) __a, __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vset_lane_s8 (int8_t __a, int8x8_t __b, const int __c)
+ {
+ return (int8x8_t)__builtin_neon_vset_lanev8qi ((__builtin_neon_qi) __a, __b, __c);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vset_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
+ {
+ return (int16x4_t)__builtin_neon_vset_lanev4hi ((__builtin_neon_hi) __a, __b, __c);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vset_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
+ {
+ return (int32x2_t)__builtin_neon_vset_lanev2si ((__builtin_neon_si) __a, __b, __c);
+@@ -5483,67 +6349,78 @@ vset_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
+ })
+ #endif
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vset_lane_f32 (float32_t __a, float32x2_t __b, const int __c)
+ {
+ return (float32x2_t)__builtin_neon_vset_lanev2sf ((__builtin_neon_sf) __a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vset_lane_u8 (uint8_t __a, uint8x8_t __b, const int __c)
+ {
+ return (uint8x8_t)__builtin_neon_vset_lanev8qi ((__builtin_neon_qi) __a, (int8x8_t) __b, __c);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vset_lane_u16 (uint16_t __a, uint16x4_t __b, const int __c)
+ {
+ return (uint16x4_t)__builtin_neon_vset_lanev4hi ((__builtin_neon_hi) __a, (int16x4_t) __b, __c);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vset_lane_u32 (uint32_t __a, uint32x2_t __b, const int __c)
+ {
+ return (uint32x2_t)__builtin_neon_vset_lanev2si ((__builtin_neon_si) __a, (int32x2_t) __b, __c);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vset_lane_p8 (poly8_t __a, poly8x8_t __b, const int __c)
+ {
+ return (poly8x8_t)__builtin_neon_vset_lanev8qi ((__builtin_neon_qi) __a, (int8x8_t) __b, __c);
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vset_lane_p16 (poly16_t __a, poly16x4_t __b, const int __c)
+ {
+ return (poly16x4_t)__builtin_neon_vset_lanev4hi ((__builtin_neon_hi) __a, (int16x4_t) __b, __c);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vset_lane_s64 (int64_t __a, int64x1_t __b, const int __c)
+ {
+ return (int64x1_t)__builtin_neon_vset_lanedi ((__builtin_neon_di) __a, __b, __c);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vset_lane_u64 (uint64_t __a, uint64x1_t __b, const int __c)
+ {
+ return (uint64x1_t)__builtin_neon_vset_lanedi ((__builtin_neon_di) __a, (int64x1_t) __b, __c);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsetq_lane_s8 (int8_t __a, int8x16_t __b, const int __c)
+ {
+ return (int8x16_t)__builtin_neon_vset_lanev16qi ((__builtin_neon_qi) __a, __b, __c);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsetq_lane_s16 (int16_t __a, int16x8_t __b, const int __c)
+ {
+ return (int16x8_t)__builtin_neon_vset_lanev8hi ((__builtin_neon_hi) __a, __b, __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsetq_lane_s32 (int32_t __a, int32x4_t __b, const int __c)
+ {
+ return (int32x4_t)__builtin_neon_vset_lanev4si ((__builtin_neon_si) __a, __b, __c);
+@@ -5561,49 +6438,57 @@ vsetq_lane_s32 (int32_t __a, int32x4_t __b, const int __c)
+ })
+ #endif
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsetq_lane_f32 (float32_t __a, float32x4_t __b, const int __c)
+ {
+ return (float32x4_t)__builtin_neon_vset_lanev4sf ((__builtin_neon_sf) __a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsetq_lane_u8 (uint8_t __a, uint8x16_t __b, const int __c)
+ {
+ return (uint8x16_t)__builtin_neon_vset_lanev16qi ((__builtin_neon_qi) __a, (int8x16_t) __b, __c);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsetq_lane_u16 (uint16_t __a, uint16x8_t __b, const int __c)
+ {
+ return (uint16x8_t)__builtin_neon_vset_lanev8hi ((__builtin_neon_hi) __a, (int16x8_t) __b, __c);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsetq_lane_u32 (uint32_t __a, uint32x4_t __b, const int __c)
+ {
+ return (uint32x4_t)__builtin_neon_vset_lanev4si ((__builtin_neon_si) __a, (int32x4_t) __b, __c);
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsetq_lane_p8 (poly8_t __a, poly8x16_t __b, const int __c)
+ {
+ return (poly8x16_t)__builtin_neon_vset_lanev16qi ((__builtin_neon_qi) __a, (int8x16_t) __b, __c);
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsetq_lane_p16 (poly16_t __a, poly16x8_t __b, const int __c)
+ {
+ return (poly16x8_t)__builtin_neon_vset_lanev8hi ((__builtin_neon_hi) __a, (int16x8_t) __b, __c);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsetq_lane_s64 (int64_t __a, int64x2_t __b, const int __c)
+ {
+ return (int64x2_t)__builtin_neon_vset_lanev2di ((__builtin_neon_di) __a, __b, __c);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsetq_lane_u64 (uint64_t __a, uint64x2_t __b, const int __c)
+ {
+ return (uint64x2_t)__builtin_neon_vset_lanev2di ((__builtin_neon_di) __a, (int64x2_t) __b, __c);
+@@ -5611,136 +6496,158 @@ vsetq_lane_u64 (uint64_t __a, uint64x2_t __b, const int __c)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcreate_p64 (uint64_t __a)
+ {
+ return (poly64x1_t)__builtin_neon_vcreatedi ((__builtin_neon_di) __a);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcreate_s8 (uint64_t __a)
+ {
+ return (int8x8_t)__builtin_neon_vcreatev8qi ((__builtin_neon_di) __a);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcreate_s16 (uint64_t __a)
+ {
+ return (int16x4_t)__builtin_neon_vcreatev4hi ((__builtin_neon_di) __a);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcreate_s32 (uint64_t __a)
+ {
+ return (int32x2_t)__builtin_neon_vcreatev2si ((__builtin_neon_di) __a);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcreate_s64 (uint64_t __a)
+ {
+ return (int64x1_t)__builtin_neon_vcreatedi ((__builtin_neon_di) __a);
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcreate_f16 (uint64_t __a)
+ {
+ return (float16x4_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcreate_f32 (uint64_t __a)
+ {
+ return (float32x2_t)__builtin_neon_vcreatev2sf ((__builtin_neon_di) __a);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcreate_u8 (uint64_t __a)
+ {
+ return (uint8x8_t)__builtin_neon_vcreatev8qi ((__builtin_neon_di) __a);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcreate_u16 (uint64_t __a)
+ {
+ return (uint16x4_t)__builtin_neon_vcreatev4hi ((__builtin_neon_di) __a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcreate_u32 (uint64_t __a)
+ {
+ return (uint32x2_t)__builtin_neon_vcreatev2si ((__builtin_neon_di) __a);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcreate_u64 (uint64_t __a)
+ {
+ return (uint64x1_t)__builtin_neon_vcreatedi ((__builtin_neon_di) __a);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcreate_p8 (uint64_t __a)
+ {
+ return (poly8x8_t)__builtin_neon_vcreatev8qi ((__builtin_neon_di) __a);
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcreate_p16 (uint64_t __a)
+ {
+ return (poly16x4_t)__builtin_neon_vcreatev4hi ((__builtin_neon_di) __a);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_n_s8 (int8_t __a)
+ {
+ return (int8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_n_s16 (int16_t __a)
+ {
+ return (int16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_n_s32 (int32_t __a)
+ {
+ return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_n_f32 (float32_t __a)
+ {
+ return (float32x2_t)__builtin_neon_vdup_nv2sf ((__builtin_neon_sf) __a);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_n_u8 (uint8_t __a)
+ {
+ return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_n_u16 (uint16_t __a)
+ {
+ return (uint16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_n_u32 (uint32_t __a)
+ {
+ return (uint32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_n_p8 (poly8_t __a)
+ {
+ return (poly8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_n_p16 (poly16_t __a)
+ {
+ return (poly16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+@@ -5748,20 +6655,23 @@ vdup_n_p16 (poly16_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_n_p64 (poly64_t __a)
+ {
+ return (poly64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_n_s64 (int64_t __a)
+ {
+ return (int64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_n_u64 (uint64_t __a)
+ {
+ return (uint64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
+@@ -5769,260 +6679,303 @@ vdup_n_u64 (uint64_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_n_p64 (poly64_t __a)
+ {
+ return (poly64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_n_s8 (int8_t __a)
+ {
+ return (int8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_n_s16 (int16_t __a)
+ {
+ return (int16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_n_s32 (int32_t __a)
+ {
+ return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_n_f32 (float32_t __a)
+ {
+ return (float32x4_t)__builtin_neon_vdup_nv4sf ((__builtin_neon_sf) __a);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_n_u8 (uint8_t __a)
+ {
+ return (uint8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_n_u16 (uint16_t __a)
+ {
+ return (uint16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_n_u32 (uint32_t __a)
+ {
+ return (uint32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_n_p8 (poly8_t __a)
+ {
+ return (poly8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_n_p16 (poly16_t __a)
+ {
+ return (poly16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_n_s64 (int64_t __a)
+ {
+ return (int64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_n_u64 (uint64_t __a)
+ {
+ return (uint64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmov_n_s8 (int8_t __a)
+ {
+ return (int8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmov_n_s16 (int16_t __a)
+ {
+ return (int16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmov_n_s32 (int32_t __a)
+ {
+ return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmov_n_f32 (float32_t __a)
+ {
+ return (float32x2_t)__builtin_neon_vdup_nv2sf ((__builtin_neon_sf) __a);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmov_n_u8 (uint8_t __a)
+ {
+ return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmov_n_u16 (uint16_t __a)
+ {
+ return (uint16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmov_n_u32 (uint32_t __a)
+ {
+ return (uint32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmov_n_p8 (poly8_t __a)
+ {
+ return (poly8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmov_n_p16 (poly16_t __a)
+ {
+ return (poly16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmov_n_s64 (int64_t __a)
+ {
+ return (int64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmov_n_u64 (uint64_t __a)
+ {
+ return (uint64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovq_n_s8 (int8_t __a)
+ {
+ return (int8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovq_n_s16 (int16_t __a)
+ {
+ return (int16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovq_n_s32 (int32_t __a)
+ {
+ return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovq_n_f32 (float32_t __a)
+ {
+ return (float32x4_t)__builtin_neon_vdup_nv4sf ((__builtin_neon_sf) __a);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovq_n_u8 (uint8_t __a)
+ {
+ return (uint8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovq_n_u16 (uint16_t __a)
+ {
+ return (uint16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovq_n_u32 (uint32_t __a)
+ {
+ return (uint32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovq_n_p8 (poly8_t __a)
+ {
+ return (poly8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovq_n_p16 (poly16_t __a)
+ {
+ return (poly16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovq_n_s64 (int64_t __a)
+ {
+ return (int64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovq_n_u64 (uint64_t __a)
+ {
+ return (uint64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_lane_s8 (int8x8_t __a, const int __b)
+ {
+ return (int8x8_t)__builtin_neon_vdup_lanev8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_lane_s16 (int16x4_t __a, const int __b)
+ {
+ return (int16x4_t)__builtin_neon_vdup_lanev4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_lane_s32 (int32x2_t __a, const int __b)
+ {
+ return (int32x2_t)__builtin_neon_vdup_lanev2si (__a, __b);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_lane_f32 (float32x2_t __a, const int __b)
+ {
+ return (float32x2_t)__builtin_neon_vdup_lanev2sf (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_lane_u8 (uint8x8_t __a, const int __b)
+ {
+ return (uint8x8_t)__builtin_neon_vdup_lanev8qi ((int8x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_lane_u16 (uint16x4_t __a, const int __b)
+ {
+ return (uint16x4_t)__builtin_neon_vdup_lanev4hi ((int16x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_lane_u32 (uint32x2_t __a, const int __b)
+ {
+ return (uint32x2_t)__builtin_neon_vdup_lanev2si ((int32x2_t) __a, __b);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_lane_p8 (poly8x8_t __a, const int __b)
+ {
+ return (poly8x8_t)__builtin_neon_vdup_lanev8qi ((int8x8_t) __a, __b);
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_lane_p16 (poly16x4_t __a, const int __b)
+ {
+ return (poly16x4_t)__builtin_neon_vdup_lanev4hi ((int16x4_t) __a, __b);
+@@ -6030,74 +6983,86 @@ vdup_lane_p16 (poly16x4_t __a, const int __b)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_lane_p64 (poly64x1_t __a, const int __b)
+ {
+ return (poly64x1_t)__builtin_neon_vdup_lanedi (__a, __b);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_lane_s64 (int64x1_t __a, const int __b)
+ {
+ return (int64x1_t)__builtin_neon_vdup_lanedi (__a, __b);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdup_lane_u64 (uint64x1_t __a, const int __b)
+ {
+ return (uint64x1_t)__builtin_neon_vdup_lanedi ((int64x1_t) __a, __b);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_lane_s8 (int8x8_t __a, const int __b)
+ {
+ return (int8x16_t)__builtin_neon_vdup_lanev16qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_lane_s16 (int16x4_t __a, const int __b)
+ {
+ return (int16x8_t)__builtin_neon_vdup_lanev8hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_lane_s32 (int32x2_t __a, const int __b)
+ {
+ return (int32x4_t)__builtin_neon_vdup_lanev4si (__a, __b);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_lane_f32 (float32x2_t __a, const int __b)
+ {
+ return (float32x4_t)__builtin_neon_vdup_lanev4sf (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_lane_u8 (uint8x8_t __a, const int __b)
+ {
+ return (uint8x16_t)__builtin_neon_vdup_lanev16qi ((int8x8_t) __a, __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_lane_u16 (uint16x4_t __a, const int __b)
+ {
+ return (uint16x8_t)__builtin_neon_vdup_lanev8hi ((int16x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_lane_u32 (uint32x2_t __a, const int __b)
+ {
+ return (uint32x4_t)__builtin_neon_vdup_lanev4si ((int32x2_t) __a, __b);
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_lane_p8 (poly8x8_t __a, const int __b)
+ {
+ return (poly8x16_t)__builtin_neon_vdup_lanev16qi ((int8x8_t) __a, __b);
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_lane_p16 (poly16x4_t __a, const int __b)
+ {
+ return (poly16x8_t)__builtin_neon_vdup_lanev8hi ((int16x4_t) __a, __b);
+@@ -6105,20 +7070,23 @@ vdupq_lane_p16 (poly16x4_t __a, const int __b)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_lane_p64 (poly64x1_t __a, const int __b)
+ {
+ return (poly64x2_t)__builtin_neon_vdup_lanev2di (__a, __b);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_lane_s64 (int64x1_t __a, const int __b)
+ {
+ return (int64x2_t)__builtin_neon_vdup_lanev2di (__a, __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vdupq_lane_u64 (uint64x1_t __a, const int __b)
+ {
+ return (uint64x2_t)__builtin_neon_vdup_lanev2di ((int64x1_t) __a, __b);
+@@ -6126,82 +7094,95 @@ vdupq_lane_u64 (uint64x1_t __a, const int __b)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcombine_p64 (poly64x1_t __a, poly64x1_t __b)
+ {
+ return (poly64x2_t)__builtin_neon_vcombinedi (__a, __b);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcombine_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (int8x16_t)__builtin_neon_vcombinev8qi (__a, __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcombine_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vcombinev4hi (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcombine_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vcombinev2si (__a, __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcombine_s64 (int64x1_t __a, int64x1_t __b)
+ {
+ return (int64x2_t)__builtin_neon_vcombinedi (__a, __b);
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcombine_f16 (float16x4_t __a, float16x4_t __b)
+ {
+ return __builtin_neon_vcombinev4hf (__a, __b);
+ }
+ #endif
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcombine_f32 (float32x2_t __a, float32x2_t __b)
+ {
+ return (float32x4_t)__builtin_neon_vcombinev2sf (__a, __b);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcombine_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint8x16_t)__builtin_neon_vcombinev8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcombine_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vcombinev4hi ((int16x4_t) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcombine_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vcombinev2si ((int32x2_t) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcombine_u64 (uint64x1_t __a, uint64x1_t __b)
+ {
+ return (uint64x2_t)__builtin_neon_vcombinedi ((int64x1_t) __a, (int64x1_t) __b);
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcombine_p8 (poly8x8_t __a, poly8x8_t __b)
+ {
+ return (poly8x16_t)__builtin_neon_vcombinev8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcombine_p16 (poly16x4_t __a, poly16x4_t __b)
+ {
+ return (poly16x8_t)__builtin_neon_vcombinev4hi ((int16x4_t) __a, (int16x4_t) __b);
+@@ -6209,144 +7190,167 @@ vcombine_p16 (poly16x4_t __a, poly16x4_t __b)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_high_p64 (poly64x2_t __a)
+ {
+ return (poly64x1_t)__builtin_neon_vget_highv2di ((int64x2_t) __a);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_high_s8 (int8x16_t __a)
+ {
+ return (int8x8_t)__builtin_neon_vget_highv16qi (__a);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_high_s16 (int16x8_t __a)
+ {
+ return (int16x4_t)__builtin_neon_vget_highv8hi (__a);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_high_s32 (int32x4_t __a)
+ {
+ return (int32x2_t)__builtin_neon_vget_highv4si (__a);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_high_s64 (int64x2_t __a)
+ {
+ return (int64x1_t)__builtin_neon_vget_highv2di (__a);
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_high_f16 (float16x8_t __a)
+ {
+ return __builtin_neon_vget_highv8hf (__a);
+ }
+ #endif
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_high_f32 (float32x4_t __a)
+ {
+ return (float32x2_t)__builtin_neon_vget_highv4sf (__a);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_high_u8 (uint8x16_t __a)
+ {
+ return (uint8x8_t)__builtin_neon_vget_highv16qi ((int8x16_t) __a);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_high_u16 (uint16x8_t __a)
+ {
+ return (uint16x4_t)__builtin_neon_vget_highv8hi ((int16x8_t) __a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_high_u32 (uint32x4_t __a)
+ {
+ return (uint32x2_t)__builtin_neon_vget_highv4si ((int32x4_t) __a);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_high_u64 (uint64x2_t __a)
+ {
+ return (uint64x1_t)__builtin_neon_vget_highv2di ((int64x2_t) __a);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_high_p8 (poly8x16_t __a)
+ {
+ return (poly8x8_t)__builtin_neon_vget_highv16qi ((int8x16_t) __a);
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_high_p16 (poly16x8_t __a)
+ {
+ return (poly16x4_t)__builtin_neon_vget_highv8hi ((int16x8_t) __a);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_low_s8 (int8x16_t __a)
+ {
+ return (int8x8_t)__builtin_neon_vget_lowv16qi (__a);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_low_s16 (int16x8_t __a)
+ {
+ return (int16x4_t)__builtin_neon_vget_lowv8hi (__a);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_low_s32 (int32x4_t __a)
+ {
+ return (int32x2_t)__builtin_neon_vget_lowv4si (__a);
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_low_f16 (float16x8_t __a)
+ {
+ return __builtin_neon_vget_lowv8hf (__a);
+ }
+ #endif
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_low_f32 (float32x4_t __a)
+ {
+ return (float32x2_t)__builtin_neon_vget_lowv4sf (__a);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_low_u8 (uint8x16_t __a)
+ {
+ return (uint8x8_t)__builtin_neon_vget_lowv16qi ((int8x16_t) __a);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_low_u16 (uint16x8_t __a)
+ {
+ return (uint16x4_t)__builtin_neon_vget_lowv8hi ((int16x8_t) __a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_low_u32 (uint32x4_t __a)
+ {
+ return (uint32x2_t)__builtin_neon_vget_lowv4si ((int32x4_t) __a);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_low_p8 (poly8x16_t __a)
+ {
+ return (poly8x8_t)__builtin_neon_vget_lowv16qi ((int8x16_t) __a);
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_low_p16 (poly16x8_t __a)
+ {
+ return (poly16x4_t)__builtin_neon_vget_lowv8hi ((int16x8_t) __a);
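+
+ [The vcombine_*/vget_high_*/vget_low_* trio converted above is
+ complementary: vcombine joins two 64-bit D-register vectors into one
+ 128-bit Q-register vector, and the vget variants split it back. A
+ small usage sketch, assuming a NEON target:
+
+     #include <arm_neon.h>
+
+     /* Splitting a Q vector and recombining its halves is an
+        identity: the result equals the input __q.  */
+     int32x4_t
+     roundtrip_s32 (int32x4_t __q)
+     {
+       int32x2_t __lo = vget_low_s32 (__q);
+       int32x2_t __hi = vget_high_s32 (__q);
+       return vcombine_s32 (__lo, __hi);
+     }
+ ]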
+@@ -6354,68 +7358,79 @@ vget_low_p16 (poly16x8_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_low_p64 (poly64x2_t __a)
+ {
+ return (poly64x1_t)__builtin_neon_vget_lowv2di ((int64x2_t) __a);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_low_s64 (int64x2_t __a)
+ {
+ return (int64x1_t)__builtin_neon_vget_lowv2di (__a);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vget_low_u64 (uint64x2_t __a)
+ {
+ return (uint64x1_t)__builtin_neon_vget_lowv2di ((int64x2_t) __a);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcvt_s32_f32 (float32x2_t __a)
+ {
+ return (int32x2_t)__builtin_neon_vcvtsv2sf (__a);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcvt_f32_s32 (int32x2_t __a)
+ {
+ return (float32x2_t)__builtin_neon_vcvtsv2si (__a);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcvt_f32_u32 (uint32x2_t __a)
+ {
+ return (float32x2_t)__builtin_neon_vcvtuv2si ((int32x2_t) __a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcvt_u32_f32 (float32x2_t __a)
+ {
+ return (uint32x2_t)__builtin_neon_vcvtuv2sf (__a);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcvtq_s32_f32 (float32x4_t __a)
+ {
+ return (int32x4_t)__builtin_neon_vcvtsv4sf (__a);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcvtq_f32_s32 (int32x4_t __a)
+ {
+ return (float32x4_t)__builtin_neon_vcvtsv4si (__a);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcvtq_f32_u32 (uint32x4_t __a)
+ {
+ return (float32x4_t)__builtin_neon_vcvtuv4si ((int32x4_t) __a);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcvtq_u32_f32 (float32x4_t __a)
+ {
+ return (uint32x4_t)__builtin_neon_vcvtuv4sf (__a);
+@@ -6424,7 +7439,8 @@ vcvtq_u32_f32 (float32x4_t __a)
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=neon-fp16")
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcvt_f16_f32 (float32x4_t __a)
+ {
+ return (float16x4_t)__builtin_neon_vcvtv4hfv4sf (__a);
+@@ -6432,7 +7448,8 @@ vcvt_f16_f32 (float32x4_t __a)
+ #endif
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcvt_f32_f16 (float16x4_t __a)
+ {
+ return (float32x4_t)__builtin_neon_vcvtv4sfv4hf (__a);
+@@ -6440,1059 +7457,1232 @@ vcvt_f32_f16 (float16x4_t __a)
+ #endif
+ #pragma GCC pop_options
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcvt_n_s32_f32 (float32x2_t __a, const int __b)
+ {
+ return (int32x2_t)__builtin_neon_vcvts_nv2sf (__a, __b);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcvt_n_f32_s32 (int32x2_t __a, const int __b)
+ {
+ return (float32x2_t)__builtin_neon_vcvts_nv2si (__a, __b);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcvt_n_f32_u32 (uint32x2_t __a, const int __b)
+ {
+ return (float32x2_t)__builtin_neon_vcvtu_nv2si ((int32x2_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcvt_n_u32_f32 (float32x2_t __a, const int __b)
+ {
+ return (uint32x2_t)__builtin_neon_vcvtu_nv2sf (__a, __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcvtq_n_s32_f32 (float32x4_t __a, const int __b)
+ {
+ return (int32x4_t)__builtin_neon_vcvts_nv4sf (__a, __b);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcvtq_n_f32_s32 (int32x4_t __a, const int __b)
+ {
+ return (float32x4_t)__builtin_neon_vcvts_nv4si (__a, __b);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcvtq_n_f32_u32 (uint32x4_t __a, const int __b)
+ {
+ return (float32x4_t)__builtin_neon_vcvtu_nv4si ((int32x4_t) __a, __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vcvtq_n_u32_f32 (float32x4_t __a, const int __b)
+ {
+ return (uint32x4_t)__builtin_neon_vcvtu_nv4sf (__a, __b);
+ }
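+
+ [In the vcvt_n_*/vcvtq_n_* conversions just above, the const int __b
+ operand is the number of fractional bits: each lane is scaled by 2^__b
+ during the conversion, and __b must be a compile-time constant in the
+ range 1..32. A usage sketch, assuming a NEON target:
+
+     #include <arm_neon.h>
+
+     /* Convert two floats to Q16.16 fixed point: each lane is
+        multiplied by 2^16 and converted, rounding toward zero.  */
+     int32x2_t
+     to_q16_16 (float32x2_t __x)
+     {
+       return vcvt_n_s32_f32 (__x, 16);
+     }
+ ]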
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovn_s16 (int16x8_t __a)
+ {
+ return (int8x8_t)__builtin_neon_vmovnv8hi (__a);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovn_s32 (int32x4_t __a)
+ {
+ return (int16x4_t)__builtin_neon_vmovnv4si (__a);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovn_s64 (int64x2_t __a)
+ {
+ return (int32x2_t)__builtin_neon_vmovnv2di (__a);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovn_u16 (uint16x8_t __a)
+ {
+ return (uint8x8_t)__builtin_neon_vmovnv8hi ((int16x8_t) __a);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovn_u32 (uint32x4_t __a)
+ {
+ return (uint16x4_t)__builtin_neon_vmovnv4si ((int32x4_t) __a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovn_u64 (uint64x2_t __a)
+ {
+ return (uint32x2_t)__builtin_neon_vmovnv2di ((int64x2_t) __a);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqmovn_s16 (int16x8_t __a)
+ {
+ return (int8x8_t)__builtin_neon_vqmovnsv8hi (__a);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqmovn_s32 (int32x4_t __a)
+ {
+ return (int16x4_t)__builtin_neon_vqmovnsv4si (__a);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqmovn_s64 (int64x2_t __a)
+ {
+ return (int32x2_t)__builtin_neon_vqmovnsv2di (__a);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqmovn_u16 (uint16x8_t __a)
+ {
+ return (uint8x8_t)__builtin_neon_vqmovnuv8hi ((int16x8_t) __a);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqmovn_u32 (uint32x4_t __a)
+ {
+ return (uint16x4_t)__builtin_neon_vqmovnuv4si ((int32x4_t) __a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqmovn_u64 (uint64x2_t __a)
+ {
+ return (uint32x2_t)__builtin_neon_vqmovnuv2di ((int64x2_t) __a);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqmovun_s16 (int16x8_t __a)
+ {
+ return (uint8x8_t)__builtin_neon_vqmovunv8hi (__a);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqmovun_s32 (int32x4_t __a)
+ {
+ return (uint16x4_t)__builtin_neon_vqmovunv4si (__a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqmovun_s64 (int64x2_t __a)
+ {
+ return (uint32x2_t)__builtin_neon_vqmovunv2di (__a);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovl_s8 (int8x8_t __a)
+ {
+ return (int16x8_t)__builtin_neon_vmovlsv8qi (__a);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovl_s16 (int16x4_t __a)
+ {
+ return (int32x4_t)__builtin_neon_vmovlsv4hi (__a);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovl_s32 (int32x2_t __a)
+ {
+ return (int64x2_t)__builtin_neon_vmovlsv2si (__a);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovl_u8 (uint8x8_t __a)
+ {
+ return (uint16x8_t)__builtin_neon_vmovluv8qi ((int8x8_t) __a);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovl_u16 (uint16x4_t __a)
+ {
+ return (uint32x4_t)__builtin_neon_vmovluv4hi ((int16x4_t) __a);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmovl_u32 (uint32x2_t __a)
+ {
+ return (uint64x2_t)__builtin_neon_vmovluv2si ((int32x2_t) __a);
+ }
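+
+ [The narrowing intrinsics above share a shape but differ in overflow
+ behaviour: vmovn_* truncates each lane (wraps), vqmovn_* saturates,
+ and vqmovun_* saturates a signed input into an unsigned result, while
+ vmovl_* is the lossless widening inverse. For instance, assuming NEON:
+
+     #include <arm_neon.h>
+
+     /* A lane holding 0x1234 narrows to 0x34 here (truncation)...  */
+     int8x8_t narrow_wrap (int16x8_t __v) { return vmovn_s16 (__v); }
+
+     /* ...but clamps to 0x7f here (signed saturation).  */
+     int8x8_t narrow_sat (int16x8_t __v) { return vqmovn_s16 (__v); }
+ ]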
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbl1_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return (int8x8_t)__builtin_neon_vtbl1v8qi (__a, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbl1_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return (uint8x8_t)__builtin_neon_vtbl1v8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbl1_p8 (poly8x8_t __a, uint8x8_t __b)
+ {
+ return (poly8x8_t)__builtin_neon_vtbl1v8qi ((int8x8_t) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbl2_s8 (int8x8x2_t __a, int8x8_t __b)
+ {
+ union { int8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a };
+ return (int8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbl2_u8 (uint8x8x2_t __a, uint8x8_t __b)
+ {
+ union { uint8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a };
+ return (uint8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbl2_p8 (poly8x8x2_t __a, uint8x8_t __b)
+ {
+ union { poly8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a };
+ return (poly8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbl3_s8 (int8x8x3_t __a, int8x8_t __b)
+ {
+ union { int8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a };
+ return (int8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbl3_u8 (uint8x8x3_t __a, uint8x8_t __b)
+ {
+ union { uint8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a };
+ return (uint8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbl3_p8 (poly8x8x3_t __a, uint8x8_t __b)
+ {
+ union { poly8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a };
+ return (poly8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbl4_s8 (int8x8x4_t __a, int8x8_t __b)
+ {
+ union { int8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a };
+ return (int8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, __b);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbl4_u8 (uint8x8x4_t __a, uint8x8_t __b)
+ {
+ union { uint8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a };
+ return (uint8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbl4_p8 (poly8x8x4_t __a, uint8x8_t __b)
+ {
+ union { poly8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a };
+ return (poly8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, (int8x8_t) __b);
+ }
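+
+ [The vtbl*/vtbx* intrinsics above and below cannot pass an int8x8xN_t
+ structure straight to the builtin, so each one bounces it through a
+ union whose other member is the opaque wide builtin type
+ (__builtin_neon_ti for two D registers, _ei for three, _oi for four);
+ the reinterpretation costs nothing once the optimizer sees through it.
+ A reduced sketch of the same pattern, with a hypothetical name,
+ assuming arm gcc targeting NEON:
+
+     #include <arm_neon.h>
+
+     __extension__ extern __inline int8x8_t
+     __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+     my_tbl2 (int8x8x2_t __tab, int8x8_t __idx)
+     {
+       /* Reinterpret the two-vector struct as one 128-bit operand.  */
+       union { int8x8x2_t __i; __builtin_neon_ti __o; } __u = { __tab };
+       return (int8x8_t) __builtin_neon_vtbl2v8qi (__u.__o, __idx);
+     }
+ ]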
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbx1_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
+ {
+ return (int8x8_t)__builtin_neon_vtbx1v8qi (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbx1_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
+ {
+ return (uint8x8_t)__builtin_neon_vtbx1v8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbx1_p8 (poly8x8_t __a, poly8x8_t __b, uint8x8_t __c)
+ {
+ return (poly8x8_t)__builtin_neon_vtbx1v8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbx2_s8 (int8x8_t __a, int8x8x2_t __b, int8x8_t __c)
+ {
+ union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ return (int8x8_t)__builtin_neon_vtbx2v8qi (__a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbx2_u8 (uint8x8_t __a, uint8x8x2_t __b, uint8x8_t __c)
+ {
+ union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ return (uint8x8_t)__builtin_neon_vtbx2v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbx2_p8 (poly8x8_t __a, poly8x8x2_t __b, uint8x8_t __c)
+ {
+ union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ return (poly8x8_t)__builtin_neon_vtbx2v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbx3_s8 (int8x8_t __a, int8x8x3_t __b, int8x8_t __c)
+ {
+ union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ return (int8x8_t)__builtin_neon_vtbx3v8qi (__a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbx3_u8 (uint8x8_t __a, uint8x8x3_t __b, uint8x8_t __c)
+ {
+ union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ return (uint8x8_t)__builtin_neon_vtbx3v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbx3_p8 (poly8x8_t __a, poly8x8x3_t __b, uint8x8_t __c)
+ {
+ union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ return (poly8x8_t)__builtin_neon_vtbx3v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbx4_s8 (int8x8_t __a, int8x8x4_t __b, int8x8_t __c)
+ {
+ union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ return (int8x8_t)__builtin_neon_vtbx4v8qi (__a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbx4_u8 (uint8x8_t __a, uint8x8x4_t __b, uint8x8_t __c)
+ {
+ union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ return (uint8x8_t)__builtin_neon_vtbx4v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtbx4_p8 (poly8x8_t __a, poly8x8x4_t __b, uint8x8_t __c)
+ {
+ union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ return (poly8x8_t)__builtin_neon_vtbx4v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmul_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+ {
+ return (int16x4_t)__builtin_neon_vmul_lanev4hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmul_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+ {
+ return (int32x2_t)__builtin_neon_vmul_lanev2si (__a, __b, __c);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmul_lane_f32 (float32x2_t __a, float32x2_t __b, const int __c)
+ {
+ return (float32x2_t)__builtin_neon_vmul_lanev2sf (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmul_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+ {
+ return (uint16x4_t)__builtin_neon_vmul_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmul_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+ {
+ return (uint32x2_t)__builtin_neon_vmul_lanev2si ((int32x2_t) __a, (int32x2_t) __b, __c);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmulq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
+ {
+ return (int16x8_t)__builtin_neon_vmul_lanev8hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
+ {
+ return (int32x4_t)__builtin_neon_vmul_lanev4si (__a, __b, __c);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __c)
+ {
+ return (float32x4_t)__builtin_neon_vmul_lanev4sf (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmulq_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c)
+ {
+ return (uint16x8_t)__builtin_neon_vmul_lanev8hi ((int16x8_t) __a, (int16x4_t) __b, __c);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmulq_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c)
+ {
+ return (uint32x4_t)__builtin_neon_vmul_lanev4si ((int32x4_t) __a, (int32x2_t) __b, __c);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmla_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
+ {
+ return (int16x4_t)__builtin_neon_vmla_lanev4hi (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmla_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
+ {
+ return (int32x2_t)__builtin_neon_vmla_lanev2si (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmla_lane_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c, const int __d)
+ {
+ return (float32x2_t)__builtin_neon_vmla_lanev2sf (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmla_lane_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
+ {
+ return (uint16x4_t)__builtin_neon_vmla_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmla_lane_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
+ {
+ return (uint32x2_t)__builtin_neon_vmla_lanev2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlaq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
+ {
+ return (int16x8_t)__builtin_neon_vmla_lanev8hi (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlaq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
+ {
+ return (int32x4_t)__builtin_neon_vmla_lanev4si (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b, float32x2_t __c, const int __d)
+ {
+ return (float32x4_t)__builtin_neon_vmla_lanev4sf (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlaq_lane_u16 (uint16x8_t __a, uint16x8_t __b, uint16x4_t __c, const int __d)
+ {
+ return (uint16x8_t)__builtin_neon_vmla_lanev8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x4_t) __c, __d);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlaq_lane_u32 (uint32x4_t __a, uint32x4_t __b, uint32x2_t __c, const int __d)
+ {
+ return (uint32x4_t)__builtin_neon_vmla_lanev4si ((int32x4_t) __a, (int32x4_t) __b, (int32x2_t) __c, __d);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
+ {
+ return (int32x4_t)__builtin_neon_vmlals_lanev4hi (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
+ {
+ return (int64x2_t)__builtin_neon_vmlals_lanev2si (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlal_lane_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
+ {
+ return (uint32x4_t)__builtin_neon_vmlalu_lanev4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlal_lane_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
+ {
+ return (uint64x2_t)__builtin_neon_vmlalu_lanev2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
+ {
+ return (int32x4_t)__builtin_neon_vqdmlal_lanev4hi (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
+ {
+ return (int64x2_t)__builtin_neon_vqdmlal_lanev2si (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmls_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
+ {
+ return (int16x4_t)__builtin_neon_vmls_lanev4hi (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmls_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
+ {
+ return (int32x2_t)__builtin_neon_vmls_lanev2si (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmls_lane_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c, const int __d)
+ {
+ return (float32x2_t)__builtin_neon_vmls_lanev2sf (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmls_lane_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
+ {
+ return (uint16x4_t)__builtin_neon_vmls_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmls_lane_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
+ {
+ return (uint32x2_t)__builtin_neon_vmls_lanev2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
+ {
+ return (int16x8_t)__builtin_neon_vmls_lanev8hi (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
+ {
+ return (int32x4_t)__builtin_neon_vmls_lanev4si (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b, float32x2_t __c, const int __d)
+ {
+ return (float32x4_t)__builtin_neon_vmls_lanev4sf (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsq_lane_u16 (uint16x8_t __a, uint16x8_t __b, uint16x4_t __c, const int __d)
+ {
+ return (uint16x8_t)__builtin_neon_vmls_lanev8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x4_t) __c, __d);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsq_lane_u32 (uint32x4_t __a, uint32x4_t __b, uint32x2_t __c, const int __d)
+ {
+ return (uint32x4_t)__builtin_neon_vmls_lanev4si ((int32x4_t) __a, (int32x4_t) __b, (int32x2_t) __c, __d);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
+ {
+ return (int32x4_t)__builtin_neon_vmlsls_lanev4hi (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
+ {
+ return (int64x2_t)__builtin_neon_vmlsls_lanev2si (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsl_lane_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
+ {
+ return (uint32x4_t)__builtin_neon_vmlslu_lanev4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsl_lane_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
+ {
+ return (uint64x2_t)__builtin_neon_vmlslu_lanev2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
+ {
+ return (int32x4_t)__builtin_neon_vqdmlsl_lanev4hi (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
+ {
+ return (int64x2_t)__builtin_neon_vqdmlsl_lanev2si (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+ {
+ return (int32x4_t)__builtin_neon_vmulls_lanev4hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+ {
+ return (int64x2_t)__builtin_neon_vmulls_lanev2si (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmull_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+ {
+ return (uint32x4_t)__builtin_neon_vmullu_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmull_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+ {
+ return (uint64x2_t)__builtin_neon_vmullu_lanev2si ((int32x2_t) __a, (int32x2_t) __b, __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+ {
+ return (int32x4_t)__builtin_neon_vqdmull_lanev4hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+ {
+ return (int64x2_t)__builtin_neon_vqdmull_lanev2si (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
+ {
+ return (int16x8_t)__builtin_neon_vqdmulh_lanev8hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
+ {
+ return (int32x4_t)__builtin_neon_vqdmulh_lanev4si (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+ {
+ return (int16x4_t)__builtin_neon_vqdmulh_lanev4hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+ {
+ return (int32x2_t)__builtin_neon_vqdmulh_lanev2si (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
+ {
+ return (int16x8_t)__builtin_neon_vqrdmulh_lanev8hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
+ {
+ return (int32x4_t)__builtin_neon_vqrdmulh_lanev4si (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+ {
+ return (int16x4_t)__builtin_neon_vqrdmulh_lanev4hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+ {
+ return (int32x2_t)__builtin_neon_vqrdmulh_lanev2si (__a, __b, __c);
+ }
+
+ #ifdef __ARM_FEATURE_QRDMX
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmlahq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
+ {
+ return (int16x8_t)__builtin_neon_vqrdmlah_lanev8hi (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmlahq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
+ {
+ return (int32x4_t)__builtin_neon_vqrdmlah_lanev4si (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmlah_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
+ {
+ return (int16x4_t)__builtin_neon_vqrdmlah_lanev4hi (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmlah_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
+ {
+ return (int32x2_t)__builtin_neon_vqrdmlah_lanev2si (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmlshq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
+ {
+ return (int16x8_t)__builtin_neon_vqrdmlsh_lanev8hi (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmlshq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
+ {
+ return (int32x4_t)__builtin_neon_vqrdmlsh_lanev4si (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmlsh_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
+ {
+ return (int16x4_t)__builtin_neon_vqrdmlsh_lanev4hi (__a, __b, __c, __d);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmlsh_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
+ {
+ return (int32x2_t)__builtin_neon_vqrdmlsh_lanev2si (__a, __b, __c, __d);
+ }
+ #endif
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmul_n_s16 (int16x4_t __a, int16_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vmul_nv4hi (__a, (__builtin_neon_hi) __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmul_n_s32 (int32x2_t __a, int32_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vmul_nv2si (__a, (__builtin_neon_si) __b);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmul_n_f32 (float32x2_t __a, float32_t __b)
+ {
+ return (float32x2_t)__builtin_neon_vmul_nv2sf (__a, (__builtin_neon_sf) __b);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmul_n_u16 (uint16x4_t __a, uint16_t __b)
+ {
+ return (uint16x4_t)__builtin_neon_vmul_nv4hi ((int16x4_t) __a, (__builtin_neon_hi) __b);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmul_n_u32 (uint32x2_t __a, uint32_t __b)
+ {
+ return (uint32x2_t)__builtin_neon_vmul_nv2si ((int32x2_t) __a, (__builtin_neon_si) __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmulq_n_s16 (int16x8_t __a, int16_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vmul_nv8hi (__a, (__builtin_neon_hi) __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmulq_n_s32 (int32x4_t __a, int32_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vmul_nv4si (__a, (__builtin_neon_si) __b);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmulq_n_f32 (float32x4_t __a, float32_t __b)
+ {
+ return (float32x4_t)__builtin_neon_vmul_nv4sf (__a, (__builtin_neon_sf) __b);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
+ {
+ return (uint16x8_t)__builtin_neon_vmul_nv8hi ((int16x8_t) __a, (__builtin_neon_hi) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vmul_nv4si ((int32x4_t) __a, (__builtin_neon_si) __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmull_n_s16 (int16x4_t __a, int16_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vmulls_nv4hi (__a, (__builtin_neon_hi) __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmull_n_s32 (int32x2_t __a, int32_t __b)
+ {
+ return (int64x2_t)__builtin_neon_vmulls_nv2si (__a, (__builtin_neon_si) __b);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmull_n_u16 (uint16x4_t __a, uint16_t __b)
+ {
+ return (uint32x4_t)__builtin_neon_vmullu_nv4hi ((int16x4_t) __a, (__builtin_neon_hi) __b);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmull_n_u32 (uint32x2_t __a, uint32_t __b)
+ {
+ return (uint64x2_t)__builtin_neon_vmullu_nv2si ((int32x2_t) __a, (__builtin_neon_si) __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmull_n_s16 (int16x4_t __a, int16_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vqdmull_nv4hi (__a, (__builtin_neon_hi) __b);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmull_n_s32 (int32x2_t __a, int32_t __b)
+ {
+ return (int64x2_t)__builtin_neon_vqdmull_nv2si (__a, (__builtin_neon_si) __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmulhq_n_s16 (int16x8_t __a, int16_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vqdmulh_nv8hi (__a, (__builtin_neon_hi) __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmulhq_n_s32 (int32x4_t __a, int32_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vqdmulh_nv4si (__a, (__builtin_neon_si) __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmulh_n_s16 (int16x4_t __a, int16_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vqdmulh_nv4hi (__a, (__builtin_neon_hi) __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmulh_n_s32 (int32x2_t __a, int32_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vqdmulh_nv2si (__a, (__builtin_neon_si) __b);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmulhq_n_s16 (int16x8_t __a, int16_t __b)
+ {
+ return (int16x8_t)__builtin_neon_vqrdmulh_nv8hi (__a, (__builtin_neon_hi) __b);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmulhq_n_s32 (int32x4_t __a, int32_t __b)
+ {
+ return (int32x4_t)__builtin_neon_vqrdmulh_nv4si (__a, (__builtin_neon_si) __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmulh_n_s16 (int16x4_t __a, int16_t __b)
+ {
+ return (int16x4_t)__builtin_neon_vqrdmulh_nv4hi (__a, (__builtin_neon_hi) __b);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqrdmulh_n_s32 (int32x2_t __a, int32_t __b)
+ {
+ return (int32x2_t)__builtin_neon_vqrdmulh_nv2si (__a, (__builtin_neon_si) __b);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmla_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c)
+ {
+ return (int16x4_t)__builtin_neon_vmla_nv4hi (__a, __b, (__builtin_neon_hi) __c);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmla_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c)
+ {
+ return (int32x2_t)__builtin_neon_vmla_nv2si (__a, __b, (__builtin_neon_si) __c);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmla_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
+ {
+ return (float32x2_t)__builtin_neon_vmla_nv2sf (__a, __b, (__builtin_neon_sf) __c);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmla_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c)
+ {
+ return (uint16x4_t)__builtin_neon_vmla_nv4hi ((int16x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmla_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c)
+ {
+ return (uint32x2_t)__builtin_neon_vmla_nv2si ((int32x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlaq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c)
+ {
+ return (int16x8_t)__builtin_neon_vmla_nv8hi (__a, __b, (__builtin_neon_hi) __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlaq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c)
+ {
+ return (int32x4_t)__builtin_neon_vmla_nv4si (__a, __b, (__builtin_neon_si) __c);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
+ {
+ return (float32x4_t)__builtin_neon_vmla_nv4sf (__a, __b, (__builtin_neon_sf) __c);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlaq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c)
+ {
+ return (uint16x8_t)__builtin_neon_vmla_nv8hi ((int16x8_t) __a, (int16x8_t) __b, (__builtin_neon_hi) __c);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlaq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c)
+ {
+ return (uint32x4_t)__builtin_neon_vmla_nv4si ((int32x4_t) __a, (int32x4_t) __b, (__builtin_neon_si) __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
+ {
+ return (int32x4_t)__builtin_neon_vmlals_nv4hi (__a, __b, (__builtin_neon_hi) __c);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
+ {
+ return (int64x2_t)__builtin_neon_vmlals_nv2si (__a, __b, (__builtin_neon_si) __c);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlal_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c)
+ {
+ return (uint32x4_t)__builtin_neon_vmlalu_nv4hi ((int32x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlal_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c)
+ {
+ return (uint64x2_t)__builtin_neon_vmlalu_nv2si ((int64x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
+ {
+ return (int32x4_t)__builtin_neon_vqdmlal_nv4hi (__a, __b, (__builtin_neon_hi) __c);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
+ {
+ return (int64x2_t)__builtin_neon_vqdmlal_nv2si (__a, __b, (__builtin_neon_si) __c);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmls_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c)
+ {
+ return (int16x4_t)__builtin_neon_vmls_nv4hi (__a, __b, (__builtin_neon_hi) __c);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmls_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c)
+ {
+ return (int32x2_t)__builtin_neon_vmls_nv2si (__a, __b, (__builtin_neon_si) __c);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmls_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
+ {
+ return (float32x2_t)__builtin_neon_vmls_nv2sf (__a, __b, (__builtin_neon_sf) __c);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmls_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c)
+ {
+ return (uint16x4_t)__builtin_neon_vmls_nv4hi ((int16x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmls_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c)
+ {
+ return (uint32x2_t)__builtin_neon_vmls_nv2si ((int32x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c)
+ {
+ return (int16x8_t)__builtin_neon_vmls_nv8hi (__a, __b, (__builtin_neon_hi) __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c)
+ {
+ return (int32x4_t)__builtin_neon_vmls_nv4si (__a, __b, (__builtin_neon_si) __c);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
+ {
+ return (float32x4_t)__builtin_neon_vmls_nv4sf (__a, __b, (__builtin_neon_sf) __c);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c)
+ {
+ return (uint16x8_t)__builtin_neon_vmls_nv8hi ((int16x8_t) __a, (int16x8_t) __b, (__builtin_neon_hi) __c);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c)
+ {
+ return (uint32x4_t)__builtin_neon_vmls_nv4si ((int32x4_t) __a, (int32x4_t) __b, (__builtin_neon_si) __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
+ {
+ return (int32x4_t)__builtin_neon_vmlsls_nv4hi (__a, __b, (__builtin_neon_hi) __c);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
+ {
+ return (int64x2_t)__builtin_neon_vmlsls_nv2si (__a, __b, (__builtin_neon_si) __c);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsl_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c)
+ {
+ return (uint32x4_t)__builtin_neon_vmlslu_nv4hi ((int32x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmlsl_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c)
+ {
+ return (uint64x2_t)__builtin_neon_vmlslu_nv2si ((int64x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
+ {
+ return (int32x4_t)__builtin_neon_vqdmlsl_nv4hi (__a, __b, (__builtin_neon_hi) __c);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vqdmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
+ {
+ return (int64x2_t)__builtin_neon_vqdmlsl_nv2si (__a, __b, (__builtin_neon_si) __c);
+@@ -7500,74 +8690,86 @@ vqdmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vext_p64 (poly64x1_t __a, poly64x1_t __b, const int __c)
+ {
+ return (poly64x1_t)__builtin_neon_vextdi (__a, __b, __c);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vext_s8 (int8x8_t __a, int8x8_t __b, const int __c)
+ {
+ return (int8x8_t)__builtin_neon_vextv8qi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vext_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+ {
+ return (int16x4_t)__builtin_neon_vextv4hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vext_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+ {
+ return (int32x2_t)__builtin_neon_vextv2si (__a, __b, __c);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vext_s64 (int64x1_t __a, int64x1_t __b, const int __c)
+ {
+ return (int64x1_t)__builtin_neon_vextdi (__a, __b, __c);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vext_f32 (float32x2_t __a, float32x2_t __b, const int __c)
+ {
+ return (float32x2_t)__builtin_neon_vextv2sf (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vext_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
+ {
+ return (uint8x8_t)__builtin_neon_vextv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vext_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+ {
+ return (uint16x4_t)__builtin_neon_vextv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vext_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+ {
+ return (uint32x2_t)__builtin_neon_vextv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vext_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+ {
+ return (uint64x1_t)__builtin_neon_vextdi ((int64x1_t) __a, (int64x1_t) __b, __c);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vext_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
+ {
+ return (poly8x8_t)__builtin_neon_vextv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vext_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
+ {
+ return (poly16x4_t)__builtin_neon_vextv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+@@ -7575,290 +8777,338 @@ vext_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vextq_p64 (poly64x2_t __a, poly64x2_t __b, const int __c)
+ {
+ return (poly64x2_t)__builtin_neon_vextv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vextq_s8 (int8x16_t __a, int8x16_t __b, const int __c)
+ {
+ return (int8x16_t)__builtin_neon_vextv16qi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vextq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+ {
+ return (int16x8_t)__builtin_neon_vextv8hi (__a, __b, __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vextq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+ {
+ return (int32x4_t)__builtin_neon_vextv4si (__a, __b, __c);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vextq_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+ {
+ return (int64x2_t)__builtin_neon_vextv2di (__a, __b, __c);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vextq_f32 (float32x4_t __a, float32x4_t __b, const int __c)
+ {
+ return (float32x4_t)__builtin_neon_vextv4sf (__a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vextq_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+ {
+ return (uint8x16_t)__builtin_neon_vextv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vextq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+ {
+ return (uint16x8_t)__builtin_neon_vextv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vextq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+ {
+ return (uint32x4_t)__builtin_neon_vextv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vextq_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
+ {
+ return (uint64x2_t)__builtin_neon_vextv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vextq_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
+ {
+ return (poly8x16_t)__builtin_neon_vextv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vextq_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
+ {
+ return (poly16x8_t)__builtin_neon_vextv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev64_s8 (int8x8_t __a)
+ {
+ return (int8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev64_s16 (int16x4_t __a)
+ {
+ return (int16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 });
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev64_s32 (int32x2_t __a)
+ {
+ return (int32x2_t) __builtin_shuffle (__a, (uint32x2_t) { 1, 0 });
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev64_f32 (float32x2_t __a)
+ {
+ return (float32x2_t) __builtin_shuffle (__a, (uint32x2_t) { 1, 0 });
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev64_u8 (uint8x8_t __a)
+ {
+ return (uint8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev64_u16 (uint16x4_t __a)
+ {
+ return (uint16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 });
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev64_u32 (uint32x2_t __a)
+ {
+ return (uint32x2_t) __builtin_shuffle (__a, (uint32x2_t) { 1, 0 });
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev64_p8 (poly8x8_t __a)
+ {
+ return (poly8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev64_p16 (poly16x4_t __a)
+ {
+ return (poly16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 });
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev64q_s8 (int8x16_t __a)
+ {
+ return (int8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev64q_s16 (int16x8_t __a)
+ {
+ return (int16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev64q_s32 (int32x4_t __a)
+ {
+ return (int32x4_t) __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 });
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev64q_f32 (float32x4_t __a)
+ {
+ return (float32x4_t) __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 });
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev64q_u8 (uint8x16_t __a)
+ {
+ return (uint8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev64q_u16 (uint16x8_t __a)
+ {
+ return (uint16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev64q_u32 (uint32x4_t __a)
+ {
+ return (uint32x4_t) __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 });
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev64q_p8 (poly8x16_t __a)
+ {
+ return (poly8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev64q_p16 (poly16x8_t __a)
+ {
+ return (poly16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev32_s8 (int8x8_t __a)
+ {
+ return (int8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev32_s16 (int16x4_t __a)
+ {
+ return (int16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 });
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev32_u8 (uint8x8_t __a)
+ {
+ return (uint8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev32_u16 (uint16x4_t __a)
+ {
+ return (uint16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 });
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev32_p8 (poly8x8_t __a)
+ {
+ return (poly8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev32_p16 (poly16x4_t __a)
+ {
+ return (poly16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 });
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev32q_s8 (int8x16_t __a)
+ {
+ return (int8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev32q_s16 (int16x8_t __a)
+ {
+ return (int16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev32q_u8 (uint8x16_t __a)
+ {
+ return (uint8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev32q_u16 (uint16x8_t __a)
+ {
+ return (uint16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev32q_p8 (poly8x16_t __a)
+ {
+ return (poly8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev32q_p16 (poly16x8_t __a)
+ {
+ return (poly16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev16_s8 (int8x8_t __a)
+ {
+ return (int8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev16_u8 (uint8x8_t __a)
+ {
+ return (uint8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev16_p8 (poly8x8_t __a)
+ {
+ return (poly8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev16q_s8 (int8x16_t __a)
+ {
+ return (int8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev16q_u8 (uint8x16_t __a)
+ {
+ return (uint8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vrev16q_p8 (poly8x16_t __a)
+ {
+ return (poly8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
+@@ -7866,74 +9116,86 @@ vrev16q_p8 (poly8x16_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbsl_p64 (uint64x1_t __a, poly64x1_t __b, poly64x1_t __c)
+ {
+ return (poly64x1_t)__builtin_neon_vbsldi ((int64x1_t) __a, __b, __c);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbsl_s8 (uint8x8_t __a, int8x8_t __b, int8x8_t __c)
+ {
+ return (int8x8_t)__builtin_neon_vbslv8qi ((int8x8_t) __a, __b, __c);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbsl_s16 (uint16x4_t __a, int16x4_t __b, int16x4_t __c)
+ {
+ return (int16x4_t)__builtin_neon_vbslv4hi ((int16x4_t) __a, __b, __c);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbsl_s32 (uint32x2_t __a, int32x2_t __b, int32x2_t __c)
+ {
+ return (int32x2_t)__builtin_neon_vbslv2si ((int32x2_t) __a, __b, __c);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbsl_s64 (uint64x1_t __a, int64x1_t __b, int64x1_t __c)
+ {
+ return (int64x1_t)__builtin_neon_vbsldi ((int64x1_t) __a, __b, __c);
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbsl_f32 (uint32x2_t __a, float32x2_t __b, float32x2_t __c)
+ {
+ return (float32x2_t)__builtin_neon_vbslv2sf ((int32x2_t) __a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbsl_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
+ {
+ return (uint8x8_t)__builtin_neon_vbslv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbsl_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
+ {
+ return (uint16x4_t)__builtin_neon_vbslv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbsl_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
+ {
+ return (uint32x2_t)__builtin_neon_vbslv2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbsl_u64 (uint64x1_t __a, uint64x1_t __b, uint64x1_t __c)
+ {
+ return (uint64x1_t)__builtin_neon_vbsldi ((int64x1_t) __a, (int64x1_t) __b, (int64x1_t) __c);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbsl_p8 (uint8x8_t __a, poly8x8_t __b, poly8x8_t __c)
+ {
+ return (poly8x8_t)__builtin_neon_vbslv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbsl_p16 (uint16x4_t __a, poly16x4_t __b, poly16x4_t __c)
+ {
+ return (poly16x4_t)__builtin_neon_vbslv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
+@@ -7941,74 +9203,86 @@ vbsl_p16 (uint16x4_t __a, poly16x4_t __b, poly16x4_t __c)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbslq_p64 (uint64x2_t __a, poly64x2_t __b, poly64x2_t __c)
+ {
+ return (poly64x2_t)__builtin_neon_vbslv2di ((int64x2_t) __a, (int64x2_t) __b, (int64x2_t) __c);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbslq_s8 (uint8x16_t __a, int8x16_t __b, int8x16_t __c)
+ {
+ return (int8x16_t)__builtin_neon_vbslv16qi ((int8x16_t) __a, __b, __c);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbslq_s16 (uint16x8_t __a, int16x8_t __b, int16x8_t __c)
+ {
+ return (int16x8_t)__builtin_neon_vbslv8hi ((int16x8_t) __a, __b, __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbslq_s32 (uint32x4_t __a, int32x4_t __b, int32x4_t __c)
+ {
+ return (int32x4_t)__builtin_neon_vbslv4si ((int32x4_t) __a, __b, __c);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbslq_s64 (uint64x2_t __a, int64x2_t __b, int64x2_t __c)
+ {
+ return (int64x2_t)__builtin_neon_vbslv2di ((int64x2_t) __a, __b, __c);
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbslq_f32 (uint32x4_t __a, float32x4_t __b, float32x4_t __c)
+ {
+ return (float32x4_t)__builtin_neon_vbslv4sf ((int32x4_t) __a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbslq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
+ {
+ return (uint8x16_t)__builtin_neon_vbslv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbslq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
+ {
+ return (uint16x8_t)__builtin_neon_vbslv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbslq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
+ {
+ return (uint32x4_t)__builtin_neon_vbslv4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbslq_u64 (uint64x2_t __a, uint64x2_t __b, uint64x2_t __c)
+ {
+ return (uint64x2_t)__builtin_neon_vbslv2di ((int64x2_t) __a, (int64x2_t) __b, (int64x2_t) __c);
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbslq_p8 (uint8x16_t __a, poly8x16_t __b, poly8x16_t __c)
+ {
+ return (poly8x16_t)__builtin_neon_vbslv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c);
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbslq_p16 (uint16x8_t __a, poly16x8_t __b, poly16x8_t __c)
+ {
+ return (poly16x8_t)__builtin_neon_vbslv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c);
+@@ -8025,7 +9299,8 @@ vbslq_p16 (uint16x8_t __a, poly16x8_t __b, poly16x8_t __c)
+ vector, and will itself be loaded in reverse order (again, relative to the
+ neon intrinsics view, i.e. that would result from a "vld1" instruction). */
+
+-__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtrn_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ int8x8x2_t __rv;
+@@ -8043,7 +9318,8 @@ vtrn_s8 (int8x8_t __a, int8x8_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtrn_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ int16x4x2_t __rv;
+@@ -8057,7 +9333,8 @@ vtrn_s16 (int16x4_t __a, int16x4_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtrn_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ uint8x8x2_t __rv;
+@@ -8075,7 +9352,8 @@ vtrn_u8 (uint8x8_t __a, uint8x8_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtrn_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ uint16x4x2_t __rv;
+@@ -8089,7 +9367,8 @@ vtrn_u16 (uint16x4_t __a, uint16x4_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtrn_p8 (poly8x8_t __a, poly8x8_t __b)
+ {
+ poly8x8x2_t __rv;
+@@ -8107,7 +9386,8 @@ vtrn_p8 (poly8x8_t __a, poly8x8_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtrn_p16 (poly16x4_t __a, poly16x4_t __b)
+ {
+ poly16x4x2_t __rv;
+@@ -8121,7 +9401,8 @@ vtrn_p16 (poly16x4_t __a, poly16x4_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtrn_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ int32x2x2_t __rv;
+@@ -8135,7 +9416,8 @@ vtrn_s32 (int32x2_t __a, int32x2_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtrn_f32 (float32x2_t __a, float32x2_t __b)
+ {
+ float32x2x2_t __rv;
+@@ -8149,7 +9431,8 @@ vtrn_f32 (float32x2_t __a, float32x2_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtrn_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ uint32x2x2_t __rv;
+@@ -8163,7 +9446,8 @@ vtrn_u32 (uint32x2_t __a, uint32x2_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtrnq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ int8x16x2_t __rv;
+@@ -8181,7 +9465,8 @@ vtrnq_s8 (int8x16_t __a, int8x16_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtrnq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ int16x8x2_t __rv;
+@@ -8199,7 +9484,8 @@ vtrnq_s16 (int16x8_t __a, int16x8_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtrnq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ int32x4x2_t __rv;
+@@ -8213,7 +9499,8 @@ vtrnq_s32 (int32x4_t __a, int32x4_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtrnq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+ float32x4x2_t __rv;
+@@ -8227,7 +9514,8 @@ vtrnq_f32 (float32x4_t __a, float32x4_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtrnq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ uint8x16x2_t __rv;
+@@ -8245,7 +9533,8 @@ vtrnq_u8 (uint8x16_t __a, uint8x16_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtrnq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ uint16x8x2_t __rv;
+@@ -8263,7 +9552,8 @@ vtrnq_u16 (uint16x8_t __a, uint16x8_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtrnq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ uint32x4x2_t __rv;
+@@ -8277,7 +9567,8 @@ vtrnq_u32 (uint32x4_t __a, uint32x4_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtrnq_p8 (poly8x16_t __a, poly8x16_t __b)
+ {
+ poly8x16x2_t __rv;
+@@ -8295,7 +9586,8 @@ vtrnq_p8 (poly8x16_t __a, poly8x16_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtrnq_p16 (poly16x8_t __a, poly16x8_t __b)
+ {
+ poly16x8x2_t __rv;
+@@ -8313,7 +9605,8 @@ vtrnq_p16 (poly16x8_t __a, poly16x8_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vzip_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ int8x8x2_t __rv;
+@@ -8331,7 +9624,8 @@ vzip_s8 (int8x8_t __a, int8x8_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vzip_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ int16x4x2_t __rv;
+@@ -8345,7 +9639,8 @@ vzip_s16 (int16x4_t __a, int16x4_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vzip_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ uint8x8x2_t __rv;
+@@ -8363,7 +9658,8 @@ vzip_u8 (uint8x8_t __a, uint8x8_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vzip_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ uint16x4x2_t __rv;
+@@ -8377,7 +9673,8 @@ vzip_u16 (uint16x4_t __a, uint16x4_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vzip_p8 (poly8x8_t __a, poly8x8_t __b)
+ {
+ poly8x8x2_t __rv;
+@@ -8395,7 +9692,8 @@ vzip_p8 (poly8x8_t __a, poly8x8_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vzip_p16 (poly16x4_t __a, poly16x4_t __b)
+ {
+ poly16x4x2_t __rv;
+@@ -8409,7 +9707,8 @@ vzip_p16 (poly16x4_t __a, poly16x4_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vzip_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ int32x2x2_t __rv;
+@@ -8423,7 +9722,8 @@ vzip_s32 (int32x2_t __a, int32x2_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vzip_f32 (float32x2_t __a, float32x2_t __b)
+ {
+ float32x2x2_t __rv;
+@@ -8437,7 +9737,8 @@ vzip_f32 (float32x2_t __a, float32x2_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vzip_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ uint32x2x2_t __rv;
+@@ -8451,7 +9752,8 @@ vzip_u32 (uint32x2_t __a, uint32x2_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vzipq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ int8x16x2_t __rv;
+@@ -8469,7 +9771,8 @@ vzipq_s8 (int8x16_t __a, int8x16_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vzipq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ int16x8x2_t __rv;
+@@ -8487,7 +9790,8 @@ vzipq_s16 (int16x8_t __a, int16x8_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vzipq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ int32x4x2_t __rv;
+@@ -8501,7 +9805,8 @@ vzipq_s32 (int32x4_t __a, int32x4_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vzipq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+ float32x4x2_t __rv;
+@@ -8515,7 +9820,8 @@ vzipq_f32 (float32x4_t __a, float32x4_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vzipq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ uint8x16x2_t __rv;
+@@ -8533,7 +9839,8 @@ vzipq_u8 (uint8x16_t __a, uint8x16_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vzipq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ uint16x8x2_t __rv;
+@@ -8551,7 +9858,8 @@ vzipq_u16 (uint16x8_t __a, uint16x8_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vzipq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ uint32x4x2_t __rv;
+@@ -8565,7 +9873,8 @@ vzipq_u32 (uint32x4_t __a, uint32x4_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vzipq_p8 (poly8x16_t __a, poly8x16_t __b)
+ {
+ poly8x16x2_t __rv;
+@@ -8583,7 +9892,8 @@ vzipq_p8 (poly8x16_t __a, poly8x16_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vzipq_p16 (poly16x8_t __a, poly16x8_t __b)
+ {
+ poly16x8x2_t __rv;
+@@ -8601,7 +9911,8 @@ vzipq_p16 (poly16x8_t __a, poly16x8_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vuzp_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ int8x8x2_t __rv;
+@@ -8619,7 +9930,8 @@ vuzp_s8 (int8x8_t __a, int8x8_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vuzp_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ int16x4x2_t __rv;
+@@ -8633,7 +9945,8 @@ vuzp_s16 (int16x4_t __a, int16x4_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vuzp_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ int32x2x2_t __rv;
+@@ -8647,7 +9960,8 @@ vuzp_s32 (int32x2_t __a, int32x2_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vuzp_f32 (float32x2_t __a, float32x2_t __b)
+ {
+ float32x2x2_t __rv;
+@@ -8661,7 +9975,8 @@ vuzp_f32 (float32x2_t __a, float32x2_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vuzp_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ uint8x8x2_t __rv;
+@@ -8679,7 +9994,8 @@ vuzp_u8 (uint8x8_t __a, uint8x8_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vuzp_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ uint16x4x2_t __rv;
+@@ -8693,7 +10009,8 @@ vuzp_u16 (uint16x4_t __a, uint16x4_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vuzp_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ uint32x2x2_t __rv;
+@@ -8707,7 +10024,8 @@ vuzp_u32 (uint32x2_t __a, uint32x2_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vuzp_p8 (poly8x8_t __a, poly8x8_t __b)
+ {
+ poly8x8x2_t __rv;
+@@ -8725,7 +10043,8 @@ vuzp_p8 (poly8x8_t __a, poly8x8_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vuzp_p16 (poly16x4_t __a, poly16x4_t __b)
+ {
+ poly16x4x2_t __rv;
+@@ -8739,7 +10058,8 @@ vuzp_p16 (poly16x4_t __a, poly16x4_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vuzpq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ int8x16x2_t __rv;
+@@ -8757,7 +10077,8 @@ vuzpq_s8 (int8x16_t __a, int8x16_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vuzpq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ int16x8x2_t __rv;
+@@ -8775,7 +10096,8 @@ vuzpq_s16 (int16x8_t __a, int16x8_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vuzpq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ int32x4x2_t __rv;
+@@ -8789,7 +10111,8 @@ vuzpq_s32 (int32x4_t __a, int32x4_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vuzpq_f32 (float32x4_t __a, float32x4_t __b)
+ {
+ float32x4x2_t __rv;
+@@ -8803,7 +10126,8 @@ vuzpq_f32 (float32x4_t __a, float32x4_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vuzpq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ uint8x16x2_t __rv;
+@@ -8821,7 +10145,8 @@ vuzpq_u8 (uint8x16_t __a, uint8x16_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vuzpq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ uint16x8x2_t __rv;
+@@ -8839,7 +10164,8 @@ vuzpq_u16 (uint16x8_t __a, uint16x8_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vuzpq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ uint32x4x2_t __rv;
+@@ -8853,7 +10179,8 @@ vuzpq_u32 (uint32x4_t __a, uint32x4_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vuzpq_p8 (poly8x16_t __a, poly8x16_t __b)
+ {
+ poly8x16x2_t __rv;
+@@ -8871,7 +10198,8 @@ vuzpq_p8 (poly8x16_t __a, poly8x16_t __b)
+ return __rv;
+ }
+
+-__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vuzpq_p16 (poly16x8_t __a, poly16x8_t __b)
+ {
+ poly16x8x2_t __rv;
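The single change repeated across all of these hunks converts each NEON permute intrinsic (vtrn*, vzip*, vuzp*) from a static __inline function into the gnu_inline model already used by the x86 intrinsic headers: extern __inline plus __gnu_inline__ guarantees the body is only ever inlined and no out-of-line definition is emitted, while __artificial__ makes debuggers treat the wrapper as a single unit at its call site. A minimal, target-independent sketch of the new declaration shape (the twice function is hypothetical, standing in for an intrinsic):

    __extension__ extern __inline int
    __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
    twice (int __a)
    {
      return __a + __a;   /* always inlined; no out-of-line copy is emitted */
    }

    int
    main (void)
    {
      return twice (2) == 4 ? 0 : 1;
    }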
+@@ -8891,82 +10219,95 @@ vuzpq_p16 (poly16x8_t __a, poly16x8_t __b)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_p64 (const poly64_t * __a)
+ {
+ return (poly64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_s8 (const int8_t * __a)
+ {
+ return (int8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_s16 (const int16_t * __a)
+ {
+ return (int16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_s32 (const int32_t * __a)
+ {
+ return (int32x2_t)__builtin_neon_vld1v2si ((const __builtin_neon_si *) __a);
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_s64 (const int64_t * __a)
+ {
+ return (int64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a);
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_f16 (const float16_t * __a)
+ {
+ return __builtin_neon_vld1v4hf (__a);
+ }
+ #endif
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_f32 (const float32_t * __a)
+ {
+ return (float32x2_t)__builtin_neon_vld1v2sf ((const __builtin_neon_sf *) __a);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_u8 (const uint8_t * __a)
+ {
+ return (uint8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_u16 (const uint16_t * __a)
+ {
+ return (uint16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_u32 (const uint32_t * __a)
+ {
+ return (uint32x2_t)__builtin_neon_vld1v2si ((const __builtin_neon_si *) __a);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_u64 (const uint64_t * __a)
+ {
+ return (uint64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_p8 (const poly8_t * __a)
+ {
+ return (poly8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a);
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_p16 (const poly16_t * __a)
+ {
+ return (poly16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a);
+@@ -8974,144 +10315,167 @@ vld1_p16 (const poly16_t * __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_p64 (const poly64_t * __a)
+ {
+ return (poly64x2_t)__builtin_neon_vld1v2di ((const __builtin_neon_di *) __a);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_s8 (const int8_t * __a)
+ {
+ return (int8x16_t)__builtin_neon_vld1v16qi ((const __builtin_neon_qi *) __a);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_s16 (const int16_t * __a)
+ {
+ return (int16x8_t)__builtin_neon_vld1v8hi ((const __builtin_neon_hi *) __a);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_s32 (const int32_t * __a)
+ {
+ return (int32x4_t)__builtin_neon_vld1v4si ((const __builtin_neon_si *) __a);
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_s64 (const int64_t * __a)
+ {
+ return (int64x2_t)__builtin_neon_vld1v2di ((const __builtin_neon_di *) __a);
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_f16 (const float16_t * __a)
+ {
+ return __builtin_neon_vld1v8hf (__a);
+ }
+ #endif
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_f32 (const float32_t * __a)
+ {
+ return (float32x4_t)__builtin_neon_vld1v4sf ((const __builtin_neon_sf *) __a);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_u8 (const uint8_t * __a)
+ {
+ return (uint8x16_t)__builtin_neon_vld1v16qi ((const __builtin_neon_qi *) __a);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_u16 (const uint16_t * __a)
+ {
+ return (uint16x8_t)__builtin_neon_vld1v8hi ((const __builtin_neon_hi *) __a);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_u32 (const uint32_t * __a)
+ {
+ return (uint32x4_t)__builtin_neon_vld1v4si ((const __builtin_neon_si *) __a);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_u64 (const uint64_t * __a)
+ {
+ return (uint64x2_t)__builtin_neon_vld1v2di ((const __builtin_neon_di *) __a);
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_p8 (const poly8_t * __a)
+ {
+ return (poly8x16_t)__builtin_neon_vld1v16qi ((const __builtin_neon_qi *) __a);
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_p16 (const poly16_t * __a)
+ {
+ return (poly16x8_t)__builtin_neon_vld1v8hi ((const __builtin_neon_hi *) __a);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_lane_s8 (const int8_t * __a, int8x8_t __b, const int __c)
+ {
+ return (int8x8_t)__builtin_neon_vld1_lanev8qi ((const __builtin_neon_qi *) __a, __b, __c);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_lane_s16 (const int16_t * __a, int16x4_t __b, const int __c)
+ {
+ return (int16x4_t)__builtin_neon_vld1_lanev4hi ((const __builtin_neon_hi *) __a, __b, __c);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_lane_s32 (const int32_t * __a, int32x2_t __b, const int __c)
+ {
+ return (int32x2_t)__builtin_neon_vld1_lanev2si ((const __builtin_neon_si *) __a, __b, __c);
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_lane_f16 (const float16_t * __a, float16x4_t __b, const int __c)
+ {
+ return vset_lane_f16 (*__a, __b, __c);
+ }
+ #endif
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_lane_f32 (const float32_t * __a, float32x2_t __b, const int __c)
+ {
+ return (float32x2_t)__builtin_neon_vld1_lanev2sf ((const __builtin_neon_sf *) __a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_lane_u8 (const uint8_t * __a, uint8x8_t __b, const int __c)
+ {
+ return (uint8x8_t)__builtin_neon_vld1_lanev8qi ((const __builtin_neon_qi *) __a, (int8x8_t) __b, __c);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_lane_u16 (const uint16_t * __a, uint16x4_t __b, const int __c)
+ {
+ return (uint16x4_t)__builtin_neon_vld1_lanev4hi ((const __builtin_neon_hi *) __a, (int16x4_t) __b, __c);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_lane_u32 (const uint32_t * __a, uint32x2_t __b, const int __c)
+ {
+ return (uint32x2_t)__builtin_neon_vld1_lanev2si ((const __builtin_neon_si *) __a, (int32x2_t) __b, __c);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_lane_p8 (const poly8_t * __a, poly8x8_t __b, const int __c)
+ {
+ return (poly8x8_t)__builtin_neon_vld1_lanev8qi ((const __builtin_neon_qi *) __a, (int8x8_t) __b, __c);
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_lane_p16 (const poly16_t * __a, poly16x4_t __b, const int __c)
+ {
+ return (poly16x4_t)__builtin_neon_vld1_lanev4hi ((const __builtin_neon_hi *) __a, (int16x4_t) __b, __c);
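For orientation in the vld1 hunks above: each load intrinsic only casts the caller's element pointer to the matching __builtin_neon_* pointer type and forwards to a per-mode builtin (v8qi for eight 8-bit lanes, v4hi for four 16-bit lanes, and so on), so the attribute change costs nothing at run time. A usage sketch, which builds only for an ARM target with NEON enabled:

    #include <arm_neon.h>

    /* Load eight u8 lanes, then reload lane 0 from a second address.
       Each call lowers to a single NEON load. */
    uint8x8_t
    load_then_patch_lane0 (const uint8_t *base, const uint8_t *one)
    {
      uint8x8_t v = vld1_u8 (base);      /* __builtin_neon_vld1v8qi      */
      return vld1_lane_u8 (one, v, 0);   /* __builtin_neon_vld1_lanev8qi */
    }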
+@@ -9119,82 +10483,95 @@ vld1_lane_p16 (const poly16_t * __a, poly16x4_t __b, const int __c)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_lane_p64 (const poly64_t * __a, poly64x1_t __b, const int __c)
+ {
+ return (poly64x1_t)__builtin_neon_vld1_lanedi ((const __builtin_neon_di *) __a, __b, __c);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_lane_s64 (const int64_t * __a, int64x1_t __b, const int __c)
+ {
+ return (int64x1_t)__builtin_neon_vld1_lanedi ((const __builtin_neon_di *) __a, __b, __c);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_lane_u64 (const uint64_t * __a, uint64x1_t __b, const int __c)
+ {
+ return (uint64x1_t)__builtin_neon_vld1_lanedi ((const __builtin_neon_di *) __a, (int64x1_t) __b, __c);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_lane_s8 (const int8_t * __a, int8x16_t __b, const int __c)
+ {
+ return (int8x16_t)__builtin_neon_vld1_lanev16qi ((const __builtin_neon_qi *) __a, __b, __c);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_lane_s16 (const int16_t * __a, int16x8_t __b, const int __c)
+ {
+ return (int16x8_t)__builtin_neon_vld1_lanev8hi ((const __builtin_neon_hi *) __a, __b, __c);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_lane_s32 (const int32_t * __a, int32x4_t __b, const int __c)
+ {
+ return (int32x4_t)__builtin_neon_vld1_lanev4si ((const __builtin_neon_si *) __a, __b, __c);
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_lane_f16 (const float16_t * __a, float16x8_t __b, const int __c)
+ {
+ return vsetq_lane_f16 (*__a, __b, __c);
+ }
+ #endif
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_lane_f32 (const float32_t * __a, float32x4_t __b, const int __c)
+ {
+ return (float32x4_t)__builtin_neon_vld1_lanev4sf ((const __builtin_neon_sf *) __a, __b, __c);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_lane_u8 (const uint8_t * __a, uint8x16_t __b, const int __c)
+ {
+ return (uint8x16_t)__builtin_neon_vld1_lanev16qi ((const __builtin_neon_qi *) __a, (int8x16_t) __b, __c);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_lane_u16 (const uint16_t * __a, uint16x8_t __b, const int __c)
+ {
+ return (uint16x8_t)__builtin_neon_vld1_lanev8hi ((const __builtin_neon_hi *) __a, (int16x8_t) __b, __c);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_lane_u32 (const uint32_t * __a, uint32x4_t __b, const int __c)
+ {
+ return (uint32x4_t)__builtin_neon_vld1_lanev4si ((const __builtin_neon_si *) __a, (int32x4_t) __b, __c);
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_lane_p8 (const poly8_t * __a, poly8x16_t __b, const int __c)
+ {
+ return (poly8x16_t)__builtin_neon_vld1_lanev16qi ((const __builtin_neon_qi *) __a, (int8x16_t) __b, __c);
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_lane_p16 (const poly16_t * __a, poly16x8_t __b, const int __c)
+ {
+ return (poly16x8_t)__builtin_neon_vld1_lanev8hi ((const __builtin_neon_hi *) __a, (int16x8_t) __b, __c);
+@@ -9202,45 +10579,52 @@ vld1q_lane_p16 (const poly16_t * __a, poly16x8_t __b, const int __c)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_lane_p64 (const poly64_t * __a, poly64x2_t __b, const int __c)
+ {
+ return (poly64x2_t)__builtin_neon_vld1_lanev2di ((const __builtin_neon_di *) __a, (int64x2_t) __b, __c);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_lane_s64 (const int64_t * __a, int64x2_t __b, const int __c)
+ {
+ return (int64x2_t)__builtin_neon_vld1_lanev2di ((const __builtin_neon_di *) __a, __b, __c);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_lane_u64 (const uint64_t * __a, uint64x2_t __b, const int __c)
+ {
+ return (uint64x2_t)__builtin_neon_vld1_lanev2di ((const __builtin_neon_di *) __a, (int64x2_t) __b, __c);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_dup_s8 (const int8_t * __a)
+ {
+ return (int8x8_t)__builtin_neon_vld1_dupv8qi ((const __builtin_neon_qi *) __a);
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_dup_s16 (const int16_t * __a)
+ {
+ return (int16x4_t)__builtin_neon_vld1_dupv4hi ((const __builtin_neon_hi *) __a);
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_dup_s32 (const int32_t * __a)
+ {
+ return (int32x2_t)__builtin_neon_vld1_dupv2si ((const __builtin_neon_si *) __a);
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_dup_f16 (const float16_t * __a)
+ {
+ float16_t __f = *__a;
+@@ -9248,37 +10632,43 @@ vld1_dup_f16 (const float16_t * __a)
+ }
+ #endif
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_dup_f32 (const float32_t * __a)
+ {
+ return (float32x2_t)__builtin_neon_vld1_dupv2sf ((const __builtin_neon_sf *) __a);
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_dup_u8 (const uint8_t * __a)
+ {
+ return (uint8x8_t)__builtin_neon_vld1_dupv8qi ((const __builtin_neon_qi *) __a);
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_dup_u16 (const uint16_t * __a)
+ {
+ return (uint16x4_t)__builtin_neon_vld1_dupv4hi ((const __builtin_neon_hi *) __a);
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_dup_u32 (const uint32_t * __a)
+ {
+ return (uint32x2_t)__builtin_neon_vld1_dupv2si ((const __builtin_neon_si *) __a);
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_dup_p8 (const poly8_t * __a)
+ {
+ return (poly8x8_t)__builtin_neon_vld1_dupv8qi ((const __builtin_neon_qi *) __a);
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_dup_p16 (const poly16_t * __a)
+ {
+ return (poly16x4_t)__builtin_neon_vld1_dupv4hi ((const __builtin_neon_hi *) __a);
+@@ -9286,45 +10676,52 @@ vld1_dup_p16 (const poly16_t * __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_dup_p64 (const poly64_t * __a)
+ {
+ return (poly64x1_t)__builtin_neon_vld1_dupdi ((const __builtin_neon_di *) __a);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_dup_s64 (const int64_t * __a)
+ {
+ return (int64x1_t)__builtin_neon_vld1_dupdi ((const __builtin_neon_di *) __a);
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1_dup_u64 (const uint64_t * __a)
+ {
+ return (uint64x1_t)__builtin_neon_vld1_dupdi ((const __builtin_neon_di *) __a);
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_dup_s8 (const int8_t * __a)
+ {
+ return (int8x16_t)__builtin_neon_vld1_dupv16qi ((const __builtin_neon_qi *) __a);
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_dup_s16 (const int16_t * __a)
+ {
+ return (int16x8_t)__builtin_neon_vld1_dupv8hi ((const __builtin_neon_hi *) __a);
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_dup_s32 (const int32_t * __a)
+ {
+ return (int32x4_t)__builtin_neon_vld1_dupv4si ((const __builtin_neon_si *) __a);
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_dup_f16 (const float16_t * __a)
+ {
+ float16_t __f = *__a;
+@@ -9332,37 +10729,43 @@ vld1q_dup_f16 (const float16_t * __a)
+ }
+ #endif
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_dup_f32 (const float32_t * __a)
+ {
+ return (float32x4_t)__builtin_neon_vld1_dupv4sf ((const __builtin_neon_sf *) __a);
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_dup_u8 (const uint8_t * __a)
+ {
+ return (uint8x16_t)__builtin_neon_vld1_dupv16qi ((const __builtin_neon_qi *) __a);
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_dup_u16 (const uint16_t * __a)
+ {
+ return (uint16x8_t)__builtin_neon_vld1_dupv8hi ((const __builtin_neon_hi *) __a);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_dup_u32 (const uint32_t * __a)
+ {
+ return (uint32x4_t)__builtin_neon_vld1_dupv4si ((const __builtin_neon_si *) __a);
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_dup_p8 (const poly8_t * __a)
+ {
+ return (poly8x16_t)__builtin_neon_vld1_dupv16qi ((const __builtin_neon_qi *) __a);
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_dup_p16 (const poly16_t * __a)
+ {
+ return (poly16x8_t)__builtin_neon_vld1_dupv8hi ((const __builtin_neon_hi *) __a);
+@@ -9370,20 +10773,23 @@ vld1q_dup_p16 (const poly16_t * __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_dup_p64 (const poly64_t * __a)
+ {
+ return (poly64x2_t)__builtin_neon_vld1_dupv2di ((const __builtin_neon_di *) __a);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_dup_s64 (const int64_t * __a)
+ {
+ return (int64x2_t)__builtin_neon_vld1_dupv2di ((const __builtin_neon_di *) __a);
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld1q_dup_u64 (const uint64_t * __a)
+ {
+ return (uint64x2_t)__builtin_neon_vld1_dupv2di ((const __builtin_neon_di *) __a);
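The poly64 variants throughout this file sit between #pragma GCC push_options / #pragma GCC target ("fpu=crypto-neon-fp-armv8") / #pragma GCC pop_options, so only they demand the crypto-capable FPU; presumably the extern __inline conversion also helps these per-function target settings interact cleanly with callers built under a different -mfpu. The bracketing idiom in isolation (ARM-only; the helper name is hypothetical):

    #pragma GCC push_options
    #pragma GCC target ("fpu=crypto-neon-fp-armv8")
    /* Only this function is compiled for the crypto-capable FPU; the rest
       of the translation unit keeps the command-line -mfpu setting. */
    static inline int
    crypto_only_helper (void)
    {
      return 1;
    }
    #pragma GCC pop_options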
+@@ -9391,82 +10797,95 @@ vld1q_dup_u64 (const uint64_t * __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_p64 (poly64_t * __a, poly64x1_t __b)
+ {
+ __builtin_neon_vst1di ((__builtin_neon_di *) __a, __b);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_s8 (int8_t * __a, int8x8_t __b)
+ {
+ __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, __b);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_s16 (int16_t * __a, int16x4_t __b)
+ {
+ __builtin_neon_vst1v4hi ((__builtin_neon_hi *) __a, __b);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_s32 (int32_t * __a, int32x2_t __b)
+ {
+ __builtin_neon_vst1v2si ((__builtin_neon_si *) __a, __b);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_s64 (int64_t * __a, int64x1_t __b)
+ {
+ __builtin_neon_vst1di ((__builtin_neon_di *) __a, __b);
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_f16 (float16_t * __a, float16x4_t __b)
+ {
+ __builtin_neon_vst1v4hf (__a, __b);
+ }
+ #endif
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_f32 (float32_t * __a, float32x2_t __b)
+ {
+ __builtin_neon_vst1v2sf ((__builtin_neon_sf *) __a, __b);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_u8 (uint8_t * __a, uint8x8_t __b)
+ {
+ __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_u16 (uint16_t * __a, uint16x4_t __b)
+ {
+ __builtin_neon_vst1v4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_u32 (uint32_t * __a, uint32x2_t __b)
+ {
+ __builtin_neon_vst1v2si ((__builtin_neon_si *) __a, (int32x2_t) __b);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_u64 (uint64_t * __a, uint64x1_t __b)
+ {
+ __builtin_neon_vst1di ((__builtin_neon_di *) __a, (int64x1_t) __b);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_p8 (poly8_t * __a, poly8x8_t __b)
+ {
+ __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_p16 (poly16_t * __a, poly16x4_t __b)
+ {
+ __builtin_neon_vst1v4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b);
+@@ -9474,144 +10893,167 @@ vst1_p16 (poly16_t * __a, poly16x4_t __b)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_p64 (poly64_t * __a, poly64x2_t __b)
+ {
+ __builtin_neon_vst1v2di ((__builtin_neon_di *) __a, (int64x2_t) __b);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_s8 (int8_t * __a, int8x16_t __b)
+ {
+ __builtin_neon_vst1v16qi ((__builtin_neon_qi *) __a, __b);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_s16 (int16_t * __a, int16x8_t __b)
+ {
+ __builtin_neon_vst1v8hi ((__builtin_neon_hi *) __a, __b);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_s32 (int32_t * __a, int32x4_t __b)
+ {
+ __builtin_neon_vst1v4si ((__builtin_neon_si *) __a, __b);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_s64 (int64_t * __a, int64x2_t __b)
+ {
+ __builtin_neon_vst1v2di ((__builtin_neon_di *) __a, __b);
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_f16 (float16_t * __a, float16x8_t __b)
+ {
+ __builtin_neon_vst1v8hf (__a, __b);
+ }
+ #endif
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_f32 (float32_t * __a, float32x4_t __b)
+ {
+ __builtin_neon_vst1v4sf ((__builtin_neon_sf *) __a, __b);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_u8 (uint8_t * __a, uint8x16_t __b)
+ {
+ __builtin_neon_vst1v16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_u16 (uint16_t * __a, uint16x8_t __b)
+ {
+ __builtin_neon_vst1v8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_u32 (uint32_t * __a, uint32x4_t __b)
+ {
+ __builtin_neon_vst1v4si ((__builtin_neon_si *) __a, (int32x4_t) __b);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_u64 (uint64_t * __a, uint64x2_t __b)
+ {
+ __builtin_neon_vst1v2di ((__builtin_neon_di *) __a, (int64x2_t) __b);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_p8 (poly8_t * __a, poly8x16_t __b)
+ {
+ __builtin_neon_vst1v16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_p16 (poly16_t * __a, poly16x8_t __b)
+ {
+ __builtin_neon_vst1v8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_lane_s8 (int8_t * __a, int8x8_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev8qi ((__builtin_neon_qi *) __a, __b, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_lane_s16 (int16_t * __a, int16x4_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev4hi ((__builtin_neon_hi *) __a, __b, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_lane_s32 (int32_t * __a, int32x2_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev2si ((__builtin_neon_si *) __a, __b, __c);
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_lane_f16 (float16_t * __a, float16x4_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev4hf (__a, __b, __c);
+ }
+ #endif
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_lane_f32 (float32_t * __a, float32x2_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev2sf ((__builtin_neon_sf *) __a, __b, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_lane_u8 (uint8_t * __a, uint8x8_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_lane_u16 (uint16_t * __a, uint16x4_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_lane_u32 (uint32_t * __a, uint32x2_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev2si ((__builtin_neon_si *) __a, (int32x2_t) __b, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_lane_p8 (poly8_t * __a, poly8x8_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_lane_p16 (poly16_t * __a, poly16x4_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b, __c);
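The store side mirrors the loads: vst1* casts the destination pointer and forwards to __builtin_neon_vst1*, bit-casting unsigned and poly vectors to the signed vector types the builtins expect. A small round-trip sketch (again ARM/NEON-only):

    #include <arm_neon.h>

    /* Copy 16 bytes through one q register: one VLD1.8 and one VST1.8. */
    void
    copy16 (uint8_t *dst, const uint8_t *src)
    {
      vst1q_u8 (dst, vld1q_u8 (src));
    }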
+@@ -9619,82 +11061,95 @@ vst1_lane_p16 (poly16_t * __a, poly16x4_t __b, const int __c)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_lane_p64 (poly64_t * __a, poly64x1_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanedi ((__builtin_neon_di *) __a, __b, __c);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_lane_s64 (int64_t * __a, int64x1_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanedi ((__builtin_neon_di *) __a, __b, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1_lane_u64 (uint64_t * __a, uint64x1_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanedi ((__builtin_neon_di *) __a, (int64x1_t) __b, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_lane_s8 (int8_t * __a, int8x16_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev16qi ((__builtin_neon_qi *) __a, __b, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_lane_s16 (int16_t * __a, int16x8_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev8hi ((__builtin_neon_hi *) __a, __b, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_lane_s32 (int32_t * __a, int32x4_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev4si ((__builtin_neon_si *) __a, __b, __c);
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_lane_f16 (float16_t * __a, float16x8_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev8hf (__a, __b, __c);
+ }
+ #endif
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_lane_f32 (float32_t * __a, float32x4_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev4sf ((__builtin_neon_sf *) __a, __b, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_lane_u8 (uint8_t * __a, uint8x16_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_lane_u16 (uint16_t * __a, uint16x8_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_lane_u32 (uint32_t * __a, uint32x4_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev4si ((__builtin_neon_si *) __a, (int32x4_t) __b, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_lane_p8 (poly8_t * __a, poly8x16_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_lane_p16 (poly16_t * __a, poly16x8_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b, __c);
+@@ -9702,26 +11157,30 @@ vst1q_lane_p16 (poly16_t * __a, poly16x8_t __b, const int __c)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_lane_p64 (poly64_t * __a, poly64x2_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev2di ((__builtin_neon_di *) __a, (int64x2_t) __b, __c);
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_lane_s64 (int64_t * __a, int64x2_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev2di ((__builtin_neon_di *) __a, __b, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst1q_lane_u64 (uint64_t * __a, uint64x2_t __b, const int __c)
+ {
+ __builtin_neon_vst1_lanev2di ((__builtin_neon_di *) __a, (int64x2_t) __b, __c);
+ }
+
+-__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_s8 (const int8_t * __a)
+ {
+ union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -9729,7 +11188,8 @@ vld2_s8 (const int8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_s16 (const int16_t * __a)
+ {
+ union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -9737,7 +11197,8 @@ vld2_s16 (const int16_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_s32 (const int32_t * __a)
+ {
+ union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -9746,7 +11207,8 @@ vld2_s32 (const int32_t * __a)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_f16 (const float16_t * __a)
+ {
+ union { float16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -9755,7 +11217,8 @@ vld2_f16 (const float16_t * __a)
+ }
+ #endif
+
+-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_f32 (const float32_t * __a)
+ {
+ union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -9763,7 +11226,8 @@ vld2_f32 (const float32_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_u8 (const uint8_t * __a)
+ {
+ union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -9771,7 +11235,8 @@ vld2_u8 (const uint8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_u16 (const uint16_t * __a)
+ {
+ union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -9779,7 +11244,8 @@ vld2_u16 (const uint16_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_u32 (const uint32_t * __a)
+ {
+ union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -9787,7 +11253,8 @@ vld2_u32 (const uint32_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_p8 (const poly8_t * __a)
+ {
+ union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -9795,7 +11262,8 @@ vld2_p8 (const poly8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_p16 (const poly16_t * __a)
+ {
+ union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -9805,7 +11273,8 @@ vld2_p16 (const poly16_t * __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x1x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_p64 (const poly64_t * __a)
+ {
+ union { poly64x1x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -9814,7 +11283,8 @@ vld2_p64 (const poly64_t * __a)
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_s64 (const int64_t * __a)
+ {
+ union { int64x1x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -9822,7 +11292,8 @@ vld2_s64 (const int64_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint64x1x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_u64 (const uint64_t * __a)
+ {
+ union { uint64x1x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -9830,7 +11301,8 @@ vld2_u64 (const uint64_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2q_s8 (const int8_t * __a)
+ {
+ union { int8x16x2_t __i; __builtin_neon_oi __o; } __rv;
+@@ -9838,7 +11310,8 @@ vld2q_s8 (const int8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2q_s16 (const int16_t * __a)
+ {
+ union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv;
+@@ -9846,7 +11319,8 @@ vld2q_s16 (const int16_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2q_s32 (const int32_t * __a)
+ {
+ union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv;
+@@ -9855,7 +11329,8 @@ vld2q_s32 (const int32_t * __a)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2q_f16 (const float16_t * __a)
+ {
+ union { float16x8x2_t __i; __builtin_neon_oi __o; } __rv;
+@@ -9864,7 +11339,8 @@ vld2q_f16 (const float16_t * __a)
+ }
+ #endif
+
+-__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2q_f32 (const float32_t * __a)
+ {
+ union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv;
+@@ -9872,7 +11348,8 @@ vld2q_f32 (const float32_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2q_u8 (const uint8_t * __a)
+ {
+ union { uint8x16x2_t __i; __builtin_neon_oi __o; } __rv;
+@@ -9880,7 +11357,8 @@ vld2q_u8 (const uint8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2q_u16 (const uint16_t * __a)
+ {
+ union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv;
+@@ -9888,7 +11366,8 @@ vld2q_u16 (const uint16_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2q_u32 (const uint32_t * __a)
+ {
+ union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv;
+@@ -9896,7 +11375,8 @@ vld2q_u32 (const uint32_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2q_p8 (const poly8_t * __a)
+ {
+ union { poly8x16x2_t __i; __builtin_neon_oi __o; } __rv;
+@@ -9904,7 +11384,8 @@ vld2q_p8 (const poly8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2q_p16 (const poly16_t * __a)
+ {
+ union { poly16x8x2_t __i; __builtin_neon_oi __o; } __rv;
+@@ -9912,7 +11393,8 @@ vld2q_p16 (const poly16_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_lane_s8 (const int8_t * __a, int8x8x2_t __b, const int __c)
+ {
+ union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+@@ -9921,7 +11403,8 @@ vld2_lane_s8 (const int8_t * __a, int8x8x2_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_lane_s16 (const int16_t * __a, int16x4x2_t __b, const int __c)
+ {
+ union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+@@ -9930,7 +11413,8 @@ vld2_lane_s16 (const int16_t * __a, int16x4x2_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_lane_s32 (const int32_t * __a, int32x2x2_t __b, const int __c)
+ {
+ union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+@@ -9940,7 +11424,8 @@ vld2_lane_s32 (const int32_t * __a, int32x2x2_t __b, const int __c)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_lane_f16 (const float16_t * __a, float16x4x2_t __b, const int __c)
+ {
+ union { float16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+@@ -9950,7 +11435,8 @@ vld2_lane_f16 (const float16_t * __a, float16x4x2_t __b, const int __c)
+ }
+ #endif
+
+-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_lane_f32 (const float32_t * __a, float32x2x2_t __b, const int __c)
+ {
+ union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+@@ -9959,7 +11445,8 @@ vld2_lane_f32 (const float32_t * __a, float32x2x2_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_lane_u8 (const uint8_t * __a, uint8x8x2_t __b, const int __c)
+ {
+ union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+@@ -9968,7 +11455,8 @@ vld2_lane_u8 (const uint8_t * __a, uint8x8x2_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_lane_u16 (const uint16_t * __a, uint16x4x2_t __b, const int __c)
+ {
+ union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+@@ -9977,7 +11465,8 @@ vld2_lane_u16 (const uint16_t * __a, uint16x4x2_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_lane_u32 (const uint32_t * __a, uint32x2x2_t __b, const int __c)
+ {
+ union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+@@ -9986,7 +11475,8 @@ vld2_lane_u32 (const uint32_t * __a, uint32x2x2_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_lane_p8 (const poly8_t * __a, poly8x8x2_t __b, const int __c)
+ {
+ union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+@@ -9995,7 +11485,8 @@ vld2_lane_p8 (const poly8_t * __a, poly8x8x2_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_lane_p16 (const poly16_t * __a, poly16x4x2_t __b, const int __c)
+ {
+ union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+@@ -10004,7 +11495,8 @@ vld2_lane_p16 (const poly16_t * __a, poly16x4x2_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2q_lane_s16 (const int16_t * __a, int16x8x2_t __b, const int __c)
+ {
+ union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -10013,7 +11505,8 @@ vld2q_lane_s16 (const int16_t * __a, int16x8x2_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2q_lane_s32 (const int32_t * __a, int32x4x2_t __b, const int __c)
+ {
+ union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -10023,7 +11516,8 @@ vld2q_lane_s32 (const int32_t * __a, int32x4x2_t __b, const int __c)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2q_lane_f16 (const float16_t * __a, float16x8x2_t __b, const int __c)
+ {
+ union { float16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -10033,7 +11527,8 @@ vld2q_lane_f16 (const float16_t * __a, float16x8x2_t __b, const int __c)
+ }
+ #endif
+
+-__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2q_lane_f32 (const float32_t * __a, float32x4x2_t __b, const int __c)
+ {
+ union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -10042,7 +11537,8 @@ vld2q_lane_f32 (const float32_t * __a, float32x4x2_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2q_lane_u16 (const uint16_t * __a, uint16x8x2_t __b, const int __c)
+ {
+ union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -10051,7 +11547,8 @@ vld2q_lane_u16 (const uint16_t * __a, uint16x8x2_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2q_lane_u32 (const uint32_t * __a, uint32x4x2_t __b, const int __c)
+ {
+ union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -10060,7 +11557,8 @@ vld2q_lane_u32 (const uint32_t * __a, uint32x4x2_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2q_lane_p16 (const poly16_t * __a, poly16x8x2_t __b, const int __c)
+ {
+ union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -10069,7 +11567,8 @@ vld2q_lane_p16 (const poly16_t * __a, poly16x8x2_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_dup_s8 (const int8_t * __a)
+ {
+ union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -10077,7 +11576,8 @@ vld2_dup_s8 (const int8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_dup_s16 (const int16_t * __a)
+ {
+ union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -10085,7 +11585,8 @@ vld2_dup_s16 (const int16_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_dup_s32 (const int32_t * __a)
+ {
+ union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -10094,7 +11595,8 @@ vld2_dup_s32 (const int32_t * __a)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_dup_f16 (const float16_t * __a)
+ {
+ union { float16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -10103,7 +11605,8 @@ vld2_dup_f16 (const float16_t * __a)
+ }
+ #endif
+
+-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_dup_f32 (const float32_t * __a)
+ {
+ union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -10111,7 +11614,8 @@ vld2_dup_f32 (const float32_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_dup_u8 (const uint8_t * __a)
+ {
+ union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -10119,7 +11623,8 @@ vld2_dup_u8 (const uint8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_dup_u16 (const uint16_t * __a)
+ {
+ union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -10127,7 +11632,8 @@ vld2_dup_u16 (const uint16_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_dup_u32 (const uint32_t * __a)
+ {
+ union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -10135,7 +11641,8 @@ vld2_dup_u32 (const uint32_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_dup_p8 (const poly8_t * __a)
+ {
+ union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -10143,7 +11650,8 @@ vld2_dup_p8 (const poly8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_dup_p16 (const poly16_t * __a)
+ {
+ union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -10153,7 +11661,8 @@ vld2_dup_p16 (const poly16_t * __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x1x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_dup_p64 (const poly64_t * __a)
+ {
+ union { poly64x1x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -10162,7 +11671,8 @@ vld2_dup_p64 (const poly64_t * __a)
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_dup_s64 (const int64_t * __a)
+ {
+ union { int64x1x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -10170,7 +11680,8 @@ vld2_dup_s64 (const int64_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint64x1x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld2_dup_u64 (const uint64_t * __a)
+ {
+ union { uint64x1x2_t __i; __builtin_neon_ti __o; } __rv;
+@@ -10178,21 +11689,24 @@ vld2_dup_u64 (const uint64_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_s8 (int8_t * __a, int8x8x2_t __b)
+ {
+ union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_s16 (int16_t * __a, int16x4x2_t __b)
+ {
+ union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_s32 (int32_t * __a, int32x2x2_t __b)
+ {
+ union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+@@ -10200,7 +11714,8 @@ vst2_s32 (int32_t * __a, int32x2x2_t __b)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_f16 (float16_t * __a, float16x4x2_t __b)
+ {
+ union { float16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+@@ -10208,42 +11723,48 @@ vst2_f16 (float16_t * __a, float16x4x2_t __b)
+ }
+ #endif
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_f32 (float32_t * __a, float32x2x2_t __b)
+ {
+ union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2v2sf ((__builtin_neon_sf *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_u8 (uint8_t * __a, uint8x8x2_t __b)
+ {
+ union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_u16 (uint16_t * __a, uint16x4x2_t __b)
+ {
+ union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_u32 (uint32_t * __a, uint32x2x2_t __b)
+ {
+ union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2v2si ((__builtin_neon_si *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_p8 (poly8_t * __a, poly8x8x2_t __b)
+ {
+ union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_p16 (poly16_t * __a, poly16x4x2_t __b)
+ {
+ union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+@@ -10252,7 +11773,8 @@ vst2_p16 (poly16_t * __a, poly16x4x2_t __b)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_p64 (poly64_t * __a, poly64x1x2_t __b)
+ {
+ union { poly64x1x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+@@ -10260,35 +11782,40 @@ vst2_p64 (poly64_t * __a, poly64x1x2_t __b)
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_s64 (int64_t * __a, int64x1x2_t __b)
+ {
+ union { int64x1x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2di ((__builtin_neon_di *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_u64 (uint64_t * __a, uint64x1x2_t __b)
+ {
+ union { uint64x1x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2di ((__builtin_neon_di *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2q_s8 (int8_t * __a, int8x16x2_t __b)
+ {
+ union { int8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2q_s16 (int16_t * __a, int16x8x2_t __b)
+ {
+ union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2q_s32 (int32_t * __a, int32x4x2_t __b)
+ {
+ union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -10296,7 +11823,8 @@ vst2q_s32 (int32_t * __a, int32x4x2_t __b)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2q_f16 (float16_t * __a, float16x8x2_t __b)
+ {
+ union { float16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -10304,63 +11832,72 @@ vst2q_f16 (float16_t * __a, float16x8x2_t __b)
+ }
+ #endif
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2q_f32 (float32_t * __a, float32x4x2_t __b)
+ {
+ union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2v4sf ((__builtin_neon_sf *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2q_u8 (uint8_t * __a, uint8x16x2_t __b)
+ {
+ union { uint8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2q_u16 (uint16_t * __a, uint16x8x2_t __b)
+ {
+ union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2q_u32 (uint32_t * __a, uint32x4x2_t __b)
+ {
+ union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2v4si ((__builtin_neon_si *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2q_p8 (poly8_t * __a, poly8x16x2_t __b)
+ {
+ union { poly8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2q_p16 (poly16_t * __a, poly16x8x2_t __b)
+ {
+ union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_lane_s8 (int8_t * __a, int8x8x2_t __b, const int __c)
+ {
+ union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_lane_s16 (int16_t * __a, int16x4x2_t __b, const int __c)
+ {
+ union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_lane_s32 (int32_t * __a, int32x2x2_t __b, const int __c)
+ {
+ union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+@@ -10368,7 +11905,8 @@ vst2_lane_s32 (int32_t * __a, int32x2x2_t __b, const int __c)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_lane_f16 (float16_t * __a, float16x4x2_t __b, const int __c)
+ {
+ union { float16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+@@ -10376,56 +11914,64 @@ vst2_lane_f16 (float16_t * __a, float16x4x2_t __b, const int __c)
+ }
+ #endif
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_lane_f32 (float32_t * __a, float32x2x2_t __b, const int __c)
+ {
+ union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev2sf ((__builtin_neon_sf *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_lane_u8 (uint8_t * __a, uint8x8x2_t __b, const int __c)
+ {
+ union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_lane_u16 (uint16_t * __a, uint16x4x2_t __b, const int __c)
+ {
+ union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_lane_u32 (uint32_t * __a, uint32x2x2_t __b, const int __c)
+ {
+ union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_lane_p8 (poly8_t * __a, poly8x8x2_t __b, const int __c)
+ {
+ union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2_lane_p16 (poly16_t * __a, poly16x4x2_t __b, const int __c)
+ {
+ union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2q_lane_s16 (int16_t * __a, int16x8x2_t __b, const int __c)
+ {
+ union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2q_lane_s32 (int32_t * __a, int32x4x2_t __b, const int __c)
+ {
+ union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -10433,7 +11979,8 @@ vst2q_lane_s32 (int32_t * __a, int32x4x2_t __b, const int __c)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2q_lane_f16 (float16_t * __a, float16x8x2_t __b, const int __c)
+ {
+ union { float16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -10441,35 +11988,40 @@ vst2q_lane_f16 (float16_t * __a, float16x8x2_t __b, const int __c)
+ }
+ #endif
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2q_lane_f32 (float32_t * __a, float32x4x2_t __b, const int __c)
+ {
+ union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev4sf ((__builtin_neon_sf *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2q_lane_u16 (uint16_t * __a, uint16x8x2_t __b, const int __c)
+ {
+ union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2q_lane_u32 (uint32_t * __a, uint32x4x2_t __b, const int __c)
+ {
+ union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst2q_lane_p16 (poly16_t * __a, poly16x8x2_t __b, const int __c)
+ {
+ union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_s8 (const int8_t * __a)
+ {
+ union { int8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10477,7 +12029,8 @@ vld3_s8 (const int8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_s16 (const int16_t * __a)
+ {
+ union { int16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10485,7 +12038,8 @@ vld3_s16 (const int16_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_s32 (const int32_t * __a)
+ {
+ union { int32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10494,7 +12048,8 @@ vld3_s32 (const int32_t * __a)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_f16 (const float16_t * __a)
+ {
+ union { float16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10503,7 +12058,8 @@ vld3_f16 (const float16_t * __a)
+ }
+ #endif
+
+-__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_f32 (const float32_t * __a)
+ {
+ union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10511,7 +12067,8 @@ vld3_f32 (const float32_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_u8 (const uint8_t * __a)
+ {
+ union { uint8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10519,7 +12076,8 @@ vld3_u8 (const uint8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_u16 (const uint16_t * __a)
+ {
+ union { uint16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10527,7 +12085,8 @@ vld3_u16 (const uint16_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_u32 (const uint32_t * __a)
+ {
+ union { uint32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10535,7 +12094,8 @@ vld3_u32 (const uint32_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_p8 (const poly8_t * __a)
+ {
+ union { poly8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10543,7 +12103,8 @@ vld3_p8 (const poly8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_p16 (const poly16_t * __a)
+ {
+ union { poly16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10553,7 +12114,8 @@ vld3_p16 (const poly16_t * __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x1x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_p64 (const poly64_t * __a)
+ {
+ union { poly64x1x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10562,7 +12124,8 @@ vld3_p64 (const poly64_t * __a)
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_s64 (const int64_t * __a)
+ {
+ union { int64x1x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10570,7 +12133,8 @@ vld3_s64 (const int64_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint64x1x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_u64 (const uint64_t * __a)
+ {
+ union { uint64x1x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10578,7 +12142,8 @@ vld3_u64 (const uint64_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int8x16x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3q_s8 (const int8_t * __a)
+ {
+ union { int8x16x3_t __i; __builtin_neon_ci __o; } __rv;
+@@ -10586,7 +12151,8 @@ vld3q_s8 (const int8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3q_s16 (const int16_t * __a)
+ {
+ union { int16x8x3_t __i; __builtin_neon_ci __o; } __rv;
+@@ -10594,7 +12160,8 @@ vld3q_s16 (const int16_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3q_s32 (const int32_t * __a)
+ {
+ union { int32x4x3_t __i; __builtin_neon_ci __o; } __rv;
+@@ -10603,7 +12170,8 @@ vld3q_s32 (const int32_t * __a)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3q_f16 (const float16_t * __a)
+ {
+ union { float16x8x3_t __i; __builtin_neon_ci __o; } __rv;
+@@ -10612,7 +12180,8 @@ vld3q_f16 (const float16_t * __a)
+ }
+ #endif
+
+-__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3q_f32 (const float32_t * __a)
+ {
+ union { float32x4x3_t __i; __builtin_neon_ci __o; } __rv;
+@@ -10620,7 +12189,8 @@ vld3q_f32 (const float32_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint8x16x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3q_u8 (const uint8_t * __a)
+ {
+ union { uint8x16x3_t __i; __builtin_neon_ci __o; } __rv;
+@@ -10628,7 +12198,8 @@ vld3q_u8 (const uint8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3q_u16 (const uint16_t * __a)
+ {
+ union { uint16x8x3_t __i; __builtin_neon_ci __o; } __rv;
+@@ -10636,7 +12207,8 @@ vld3q_u16 (const uint16_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3q_u32 (const uint32_t * __a)
+ {
+ union { uint32x4x3_t __i; __builtin_neon_ci __o; } __rv;
+@@ -10644,7 +12216,8 @@ vld3q_u32 (const uint32_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly8x16x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3q_p8 (const poly8_t * __a)
+ {
+ union { poly8x16x3_t __i; __builtin_neon_ci __o; } __rv;
+@@ -10652,7 +12225,8 @@ vld3q_p8 (const poly8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3q_p16 (const poly16_t * __a)
+ {
+ union { poly16x8x3_t __i; __builtin_neon_ci __o; } __rv;
+@@ -10660,7 +12234,8 @@ vld3q_p16 (const poly16_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_lane_s8 (const int8_t * __a, int8x8x3_t __b, const int __c)
+ {
+ union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+@@ -10669,7 +12244,8 @@ vld3_lane_s8 (const int8_t * __a, int8x8x3_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_lane_s16 (const int16_t * __a, int16x4x3_t __b, const int __c)
+ {
+ union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+@@ -10678,7 +12254,8 @@ vld3_lane_s16 (const int16_t * __a, int16x4x3_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_lane_s32 (const int32_t * __a, int32x2x3_t __b, const int __c)
+ {
+ union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+@@ -10688,7 +12265,8 @@ vld3_lane_s32 (const int32_t * __a, int32x2x3_t __b, const int __c)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_lane_f16 (const float16_t * __a, float16x4x3_t __b, const int __c)
+ {
+ union { float16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+@@ -10698,7 +12276,8 @@ vld3_lane_f16 (const float16_t * __a, float16x4x3_t __b, const int __c)
+ }
+ #endif
+
+-__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_lane_f32 (const float32_t * __a, float32x2x3_t __b, const int __c)
+ {
+ union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+@@ -10707,7 +12286,8 @@ vld3_lane_f32 (const float32_t * __a, float32x2x3_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_lane_u8 (const uint8_t * __a, uint8x8x3_t __b, const int __c)
+ {
+ union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+@@ -10716,7 +12296,8 @@ vld3_lane_u8 (const uint8_t * __a, uint8x8x3_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_lane_u16 (const uint16_t * __a, uint16x4x3_t __b, const int __c)
+ {
+ union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+@@ -10725,7 +12306,8 @@ vld3_lane_u16 (const uint16_t * __a, uint16x4x3_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_lane_u32 (const uint32_t * __a, uint32x2x3_t __b, const int __c)
+ {
+ union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+@@ -10734,7 +12316,8 @@ vld3_lane_u32 (const uint32_t * __a, uint32x2x3_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_lane_p8 (const poly8_t * __a, poly8x8x3_t __b, const int __c)
+ {
+ union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+@@ -10743,7 +12326,8 @@ vld3_lane_p8 (const poly8_t * __a, poly8x8x3_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_lane_p16 (const poly16_t * __a, poly16x4x3_t __b, const int __c)
+ {
+ union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+@@ -10752,7 +12336,8 @@ vld3_lane_p16 (const poly16_t * __a, poly16x4x3_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3q_lane_s16 (const int16_t * __a, int16x8x3_t __b, const int __c)
+ {
+ union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+@@ -10761,7 +12346,8 @@ vld3q_lane_s16 (const int16_t * __a, int16x8x3_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3q_lane_s32 (const int32_t * __a, int32x4x3_t __b, const int __c)
+ {
+ union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+@@ -10771,7 +12357,8 @@ vld3q_lane_s32 (const int32_t * __a, int32x4x3_t __b, const int __c)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3q_lane_f16 (const float16_t * __a, float16x8x3_t __b, const int __c)
+ {
+ union { float16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+@@ -10781,7 +12368,8 @@ vld3q_lane_f16 (const float16_t * __a, float16x8x3_t __b, const int __c)
+ }
+ #endif
+
+-__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3q_lane_f32 (const float32_t * __a, float32x4x3_t __b, const int __c)
+ {
+ union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+@@ -10790,7 +12378,8 @@ vld3q_lane_f32 (const float32_t * __a, float32x4x3_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3q_lane_u16 (const uint16_t * __a, uint16x8x3_t __b, const int __c)
+ {
+ union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+@@ -10799,7 +12388,8 @@ vld3q_lane_u16 (const uint16_t * __a, uint16x8x3_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3q_lane_u32 (const uint32_t * __a, uint32x4x3_t __b, const int __c)
+ {
+ union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+@@ -10808,7 +12398,8 @@ vld3q_lane_u32 (const uint32_t * __a, uint32x4x3_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3q_lane_p16 (const poly16_t * __a, poly16x8x3_t __b, const int __c)
+ {
+ union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+@@ -10817,7 +12408,8 @@ vld3q_lane_p16 (const poly16_t * __a, poly16x8x3_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_dup_s8 (const int8_t * __a)
+ {
+ union { int8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10825,7 +12417,8 @@ vld3_dup_s8 (const int8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_dup_s16 (const int16_t * __a)
+ {
+ union { int16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10833,7 +12426,8 @@ vld3_dup_s16 (const int16_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_dup_s32 (const int32_t * __a)
+ {
+ union { int32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10842,7 +12436,8 @@ vld3_dup_s32 (const int32_t * __a)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_dup_f16 (const float16_t * __a)
+ {
+ union { float16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10851,7 +12446,8 @@ vld3_dup_f16 (const float16_t * __a)
+ }
+ #endif
+
+-__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_dup_f32 (const float32_t * __a)
+ {
+ union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10859,7 +12455,8 @@ vld3_dup_f32 (const float32_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_dup_u8 (const uint8_t * __a)
+ {
+ union { uint8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10867,7 +12464,8 @@ vld3_dup_u8 (const uint8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_dup_u16 (const uint16_t * __a)
+ {
+ union { uint16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10875,7 +12473,8 @@ vld3_dup_u16 (const uint16_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_dup_u32 (const uint32_t * __a)
+ {
+ union { uint32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10883,7 +12482,8 @@ vld3_dup_u32 (const uint32_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_dup_p8 (const poly8_t * __a)
+ {
+ union { poly8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10891,7 +12491,8 @@ vld3_dup_p8 (const poly8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_dup_p16 (const poly16_t * __a)
+ {
+ union { poly16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10901,7 +12502,8 @@ vld3_dup_p16 (const poly16_t * __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x1x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_dup_p64 (const poly64_t * __a)
+ {
+ union { poly64x1x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10910,7 +12512,8 @@ vld3_dup_p64 (const poly64_t * __a)
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_dup_s64 (const int64_t * __a)
+ {
+ union { int64x1x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10918,7 +12521,8 @@ vld3_dup_s64 (const int64_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint64x1x3_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1x3_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld3_dup_u64 (const uint64_t * __a)
+ {
+ union { uint64x1x3_t __i; __builtin_neon_ei __o; } __rv;
+@@ -10926,21 +12530,24 @@ vld3_dup_u64 (const uint64_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_s8 (int8_t * __a, int8x8x3_t __b)
+ {
+ union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_s16 (int16_t * __a, int16x4x3_t __b)
+ {
+ union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_s32 (int32_t * __a, int32x2x3_t __b)
+ {
+ union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+@@ -10948,7 +12555,8 @@ vst3_s32 (int32_t * __a, int32x2x3_t __b)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_f16 (float16_t * __a, float16x4x3_t __b)
+ {
+ union { float16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+@@ -10956,42 +12564,48 @@ vst3_f16 (float16_t * __a, float16x4x3_t __b)
+ }
+ #endif
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_f32 (float32_t * __a, float32x2x3_t __b)
+ {
+ union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3v2sf ((__builtin_neon_sf *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_u8 (uint8_t * __a, uint8x8x3_t __b)
+ {
+ union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_u16 (uint16_t * __a, uint16x4x3_t __b)
+ {
+ union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_u32 (uint32_t * __a, uint32x2x3_t __b)
+ {
+ union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3v2si ((__builtin_neon_si *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_p8 (poly8_t * __a, poly8x8x3_t __b)
+ {
+ union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_p16 (poly16_t * __a, poly16x4x3_t __b)
+ {
+ union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+@@ -11000,7 +12614,8 @@ vst3_p16 (poly16_t * __a, poly16x4x3_t __b)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_p64 (poly64_t * __a, poly64x1x3_t __b)
+ {
+ union { poly64x1x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+@@ -11008,35 +12623,40 @@ vst3_p64 (poly64_t * __a, poly64x1x3_t __b)
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_s64 (int64_t * __a, int64x1x3_t __b)
+ {
+ union { int64x1x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3di ((__builtin_neon_di *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_u64 (uint64_t * __a, uint64x1x3_t __b)
+ {
+ union { uint64x1x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3di ((__builtin_neon_di *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3q_s8 (int8_t * __a, int8x16x3_t __b)
+ {
+ union { int8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3q_s16 (int16_t * __a, int16x8x3_t __b)
+ {
+ union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3q_s32 (int32_t * __a, int32x4x3_t __b)
+ {
+ union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+@@ -11044,7 +12664,8 @@ vst3q_s32 (int32_t * __a, int32x4x3_t __b)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3q_f16 (float16_t * __a, float16x8x3_t __b)
+ {
+ union { float16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+@@ -11052,63 +12673,72 @@ vst3q_f16 (float16_t * __a, float16x8x3_t __b)
+ }
+ #endif
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3q_f32 (float32_t * __a, float32x4x3_t __b)
+ {
+ union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3v4sf ((__builtin_neon_sf *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3q_u8 (uint8_t * __a, uint8x16x3_t __b)
+ {
+ union { uint8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3q_u16 (uint16_t * __a, uint16x8x3_t __b)
+ {
+ union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3q_u32 (uint32_t * __a, uint32x4x3_t __b)
+ {
+ union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3v4si ((__builtin_neon_si *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3q_p8 (poly8_t * __a, poly8x16x3_t __b)
+ {
+ union { poly8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3q_p16 (poly16_t * __a, poly16x8x3_t __b)
+ {
+ union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_lane_s8 (int8_t * __a, int8x8x3_t __b, const int __c)
+ {
+ union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_lane_s16 (int16_t * __a, int16x4x3_t __b, const int __c)
+ {
+ union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_lane_s32 (int32_t * __a, int32x2x3_t __b, const int __c)
+ {
+ union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+@@ -11116,7 +12746,8 @@ vst3_lane_s32 (int32_t * __a, int32x2x3_t __b, const int __c)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_lane_f16 (float16_t * __a, float16x4x3_t __b, const int __c)
+ {
+ union { float16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+@@ -11124,56 +12755,64 @@ vst3_lane_f16 (float16_t * __a, float16x4x3_t __b, const int __c)
+ }
+ #endif
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_lane_f32 (float32_t * __a, float32x2x3_t __b, const int __c)
+ {
+ union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev2sf ((__builtin_neon_sf *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_lane_u8 (uint8_t * __a, uint8x8x3_t __b, const int __c)
+ {
+ union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_lane_u16 (uint16_t * __a, uint16x4x3_t __b, const int __c)
+ {
+ union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_lane_u32 (uint32_t * __a, uint32x2x3_t __b, const int __c)
+ {
+ union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_lane_p8 (poly8_t * __a, poly8x8x3_t __b, const int __c)
+ {
+ union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3_lane_p16 (poly16_t * __a, poly16x4x3_t __b, const int __c)
+ {
+ union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3q_lane_s16 (int16_t * __a, int16x8x3_t __b, const int __c)
+ {
+ union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3q_lane_s32 (int32_t * __a, int32x4x3_t __b, const int __c)
+ {
+ union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+@@ -11181,7 +12820,8 @@ vst3q_lane_s32 (int32_t * __a, int32x4x3_t __b, const int __c)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3q_lane_f16 (float16_t * __a, float16x8x3_t __b, const int __c)
+ {
+ union { float16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+@@ -11189,35 +12829,40 @@ vst3q_lane_f16 (float16_t * __a, float16x8x3_t __b, const int __c)
+ }
+ #endif
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3q_lane_f32 (float32_t * __a, float32x4x3_t __b, const int __c)
+ {
+ union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev4sf ((__builtin_neon_sf *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3q_lane_u16 (uint16_t * __a, uint16x8x3_t __b, const int __c)
+ {
+ union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3q_lane_u32 (uint32_t * __a, uint32x4x3_t __b, const int __c)
+ {
+ union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst3q_lane_p16 (poly16_t * __a, poly16x8x3_t __b, const int __c)
+ {
+ union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_s8 (const int8_t * __a)
+ {
+ union { int8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11225,7 +12870,8 @@ vld4_s8 (const int8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_s16 (const int16_t * __a)
+ {
+ union { int16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11233,7 +12879,8 @@ vld4_s16 (const int16_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_s32 (const int32_t * __a)
+ {
+ union { int32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11242,7 +12889,8 @@ vld4_s32 (const int32_t * __a)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_f16 (const float16_t * __a)
+ {
+ union { float16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11251,7 +12899,8 @@ vld4_f16 (const float16_t * __a)
+ }
+ #endif
+
+-__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_f32 (const float32_t * __a)
+ {
+ union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11259,7 +12908,8 @@ vld4_f32 (const float32_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_u8 (const uint8_t * __a)
+ {
+ union { uint8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11267,7 +12917,8 @@ vld4_u8 (const uint8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_u16 (const uint16_t * __a)
+ {
+ union { uint16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11275,7 +12926,8 @@ vld4_u16 (const uint16_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_u32 (const uint32_t * __a)
+ {
+ union { uint32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11283,7 +12935,8 @@ vld4_u32 (const uint32_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_p8 (const poly8_t * __a)
+ {
+ union { poly8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11291,7 +12944,8 @@ vld4_p8 (const poly8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_p16 (const poly16_t * __a)
+ {
+ union { poly16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11301,7 +12955,8 @@ vld4_p16 (const poly16_t * __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x1x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_p64 (const poly64_t * __a)
+ {
+ union { poly64x1x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11310,7 +12965,8 @@ vld4_p64 (const poly64_t * __a)
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_s64 (const int64_t * __a)
+ {
+ union { int64x1x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11318,7 +12974,8 @@ vld4_s64 (const int64_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint64x1x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_u64 (const uint64_t * __a)
+ {
+ union { uint64x1x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11326,7 +12983,8 @@ vld4_u64 (const uint64_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int8x16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4q_s8 (const int8_t * __a)
+ {
+ union { int8x16x4_t __i; __builtin_neon_xi __o; } __rv;
+@@ -11334,7 +12992,8 @@ vld4q_s8 (const int8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4q_s16 (const int16_t * __a)
+ {
+ union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+@@ -11342,7 +13001,8 @@ vld4q_s16 (const int16_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4q_s32 (const int32_t * __a)
+ {
+ union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+@@ -11351,7 +13011,8 @@ vld4q_s32 (const int32_t * __a)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4q_f16 (const float16_t * __a)
+ {
+ union { float16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+@@ -11360,7 +13021,8 @@ vld4q_f16 (const float16_t * __a)
+ }
+ #endif
+
+-__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4q_f32 (const float32_t * __a)
+ {
+ union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+@@ -11368,7 +13030,8 @@ vld4q_f32 (const float32_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint8x16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4q_u8 (const uint8_t * __a)
+ {
+ union { uint8x16x4_t __i; __builtin_neon_xi __o; } __rv;
+@@ -11376,7 +13039,8 @@ vld4q_u8 (const uint8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4q_u16 (const uint16_t * __a)
+ {
+ union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+@@ -11384,7 +13048,8 @@ vld4q_u16 (const uint16_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4q_u32 (const uint32_t * __a)
+ {
+ union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+@@ -11392,7 +13057,8 @@ vld4q_u32 (const uint32_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly8x16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4q_p8 (const poly8_t * __a)
+ {
+ union { poly8x16x4_t __i; __builtin_neon_xi __o; } __rv;
+@@ -11400,7 +13066,8 @@ vld4q_p8 (const poly8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4q_p16 (const poly16_t * __a)
+ {
+ union { poly16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+@@ -11408,7 +13075,8 @@ vld4q_p16 (const poly16_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_lane_s8 (const int8_t * __a, int8x8x4_t __b, const int __c)
+ {
+ union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -11417,7 +13085,8 @@ vld4_lane_s8 (const int8_t * __a, int8x8x4_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_lane_s16 (const int16_t * __a, int16x4x4_t __b, const int __c)
+ {
+ union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -11426,7 +13095,8 @@ vld4_lane_s16 (const int16_t * __a, int16x4x4_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_lane_s32 (const int32_t * __a, int32x2x4_t __b, const int __c)
+ {
+ union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -11436,7 +13106,8 @@ vld4_lane_s32 (const int32_t * __a, int32x2x4_t __b, const int __c)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_lane_f16 (const float16_t * __a, float16x4x4_t __b, const int __c)
+ {
+ union { float16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -11447,7 +13118,8 @@ vld4_lane_f16 (const float16_t * __a, float16x4x4_t __b, const int __c)
+ }
+ #endif
+
+-__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_lane_f32 (const float32_t * __a, float32x2x4_t __b, const int __c)
+ {
+ union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -11456,7 +13128,8 @@ vld4_lane_f32 (const float32_t * __a, float32x2x4_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_lane_u8 (const uint8_t * __a, uint8x8x4_t __b, const int __c)
+ {
+ union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -11465,7 +13138,8 @@ vld4_lane_u8 (const uint8_t * __a, uint8x8x4_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_lane_u16 (const uint16_t * __a, uint16x4x4_t __b, const int __c)
+ {
+ union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -11474,7 +13148,8 @@ vld4_lane_u16 (const uint16_t * __a, uint16x4x4_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_lane_u32 (const uint32_t * __a, uint32x2x4_t __b, const int __c)
+ {
+ union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -11483,7 +13158,8 @@ vld4_lane_u32 (const uint32_t * __a, uint32x2x4_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_lane_p8 (const poly8_t * __a, poly8x8x4_t __b, const int __c)
+ {
+ union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -11492,7 +13168,8 @@ vld4_lane_p8 (const poly8_t * __a, poly8x8x4_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_lane_p16 (const poly16_t * __a, poly16x4x4_t __b, const int __c)
+ {
+ union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -11501,7 +13178,8 @@ vld4_lane_p16 (const poly16_t * __a, poly16x4x4_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4q_lane_s16 (const int16_t * __a, int16x8x4_t __b, const int __c)
+ {
+ union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+@@ -11510,7 +13188,8 @@ vld4q_lane_s16 (const int16_t * __a, int16x8x4_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4q_lane_s32 (const int32_t * __a, int32x4x4_t __b, const int __c)
+ {
+ union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+@@ -11520,7 +13199,8 @@ vld4q_lane_s32 (const int32_t * __a, int32x4x4_t __b, const int __c)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4q_lane_f16 (const float16_t * __a, float16x8x4_t __b, const int __c)
+ {
+ union { float16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+@@ -11531,7 +13211,8 @@ vld4q_lane_f16 (const float16_t * __a, float16x8x4_t __b, const int __c)
+ }
+ #endif
+
+-__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4q_lane_f32 (const float32_t * __a, float32x4x4_t __b, const int __c)
+ {
+ union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+@@ -11540,7 +13221,8 @@ vld4q_lane_f32 (const float32_t * __a, float32x4x4_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4q_lane_u16 (const uint16_t * __a, uint16x8x4_t __b, const int __c)
+ {
+ union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+@@ -11549,7 +13231,8 @@ vld4q_lane_u16 (const uint16_t * __a, uint16x8x4_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4q_lane_u32 (const uint32_t * __a, uint32x4x4_t __b, const int __c)
+ {
+ union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+@@ -11558,7 +13241,8 @@ vld4q_lane_u32 (const uint32_t * __a, uint32x4x4_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4q_lane_p16 (const poly16_t * __a, poly16x8x4_t __b, const int __c)
+ {
+ union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+@@ -11567,7 +13251,8 @@ vld4q_lane_p16 (const poly16_t * __a, poly16x8x4_t __b, const int __c)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_dup_s8 (const int8_t * __a)
+ {
+ union { int8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11575,7 +13260,8 @@ vld4_dup_s8 (const int8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_dup_s16 (const int16_t * __a)
+ {
+ union { int16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11583,7 +13269,8 @@ vld4_dup_s16 (const int16_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_dup_s32 (const int32_t * __a)
+ {
+ union { int32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11592,7 +13279,8 @@ vld4_dup_s32 (const int32_t * __a)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_dup_f16 (const float16_t * __a)
+ {
+ union { float16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11601,7 +13289,8 @@ vld4_dup_f16 (const float16_t * __a)
+ }
+ #endif
+
+-__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_dup_f32 (const float32_t * __a)
+ {
+ union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11609,7 +13298,8 @@ vld4_dup_f32 (const float32_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_dup_u8 (const uint8_t * __a)
+ {
+ union { uint8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11617,7 +13307,8 @@ vld4_dup_u8 (const uint8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_dup_u16 (const uint16_t * __a)
+ {
+ union { uint16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11625,7 +13316,8 @@ vld4_dup_u16 (const uint16_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_dup_u32 (const uint32_t * __a)
+ {
+ union { uint32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11633,7 +13325,8 @@ vld4_dup_u32 (const uint32_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_dup_p8 (const poly8_t * __a)
+ {
+ union { poly8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11641,7 +13334,8 @@ vld4_dup_p8 (const poly8_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_dup_p16 (const poly16_t * __a)
+ {
+ union { poly16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11651,7 +13345,8 @@ vld4_dup_p16 (const poly16_t * __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x1x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_dup_p64 (const poly64_t * __a)
+ {
+ union { poly64x1x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11660,7 +13355,8 @@ vld4_dup_p64 (const poly64_t * __a)
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_dup_s64 (const int64_t * __a)
+ {
+ union { int64x1x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11668,7 +13364,8 @@ vld4_dup_s64 (const int64_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline uint64x1x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vld4_dup_u64 (const uint64_t * __a)
+ {
+ union { uint64x1x4_t __i; __builtin_neon_oi __o; } __rv;
+@@ -11676,21 +13373,24 @@ vld4_dup_u64 (const uint64_t * __a)
+ return __rv.__i;
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_s8 (int8_t * __a, int8x8x4_t __b)
+ {
+ union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_s16 (int16_t * __a, int16x4x4_t __b)
+ {
+ union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_s32 (int32_t * __a, int32x2x4_t __b)
+ {
+ union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -11698,7 +13398,8 @@ vst4_s32 (int32_t * __a, int32x2x4_t __b)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_f16 (float16_t * __a, float16x4x4_t __b)
+ {
+ union { float16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -11706,42 +13407,48 @@ vst4_f16 (float16_t * __a, float16x4x4_t __b)
+ }
+ #endif
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_f32 (float32_t * __a, float32x2x4_t __b)
+ {
+ union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4v2sf ((__builtin_neon_sf *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_u8 (uint8_t * __a, uint8x8x4_t __b)
+ {
+ union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_u16 (uint16_t * __a, uint16x4x4_t __b)
+ {
+ union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_u32 (uint32_t * __a, uint32x2x4_t __b)
+ {
+ union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4v2si ((__builtin_neon_si *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_p8 (poly8_t * __a, poly8x8x4_t __b)
+ {
+ union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_p16 (poly16_t * __a, poly16x4x4_t __b)
+ {
+ union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -11750,7 +13457,8 @@ vst4_p16 (poly16_t * __a, poly16x4x4_t __b)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_p64 (poly64_t * __a, poly64x1x4_t __b)
+ {
+ union { poly64x1x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -11758,35 +13466,40 @@ vst4_p64 (poly64_t * __a, poly64x1x4_t __b)
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_s64 (int64_t * __a, int64x1x4_t __b)
+ {
+ union { int64x1x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4di ((__builtin_neon_di *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_u64 (uint64_t * __a, uint64x1x4_t __b)
+ {
+ union { uint64x1x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4di ((__builtin_neon_di *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4q_s8 (int8_t * __a, int8x16x4_t __b)
+ {
+ union { int8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4q_s16 (int16_t * __a, int16x8x4_t __b)
+ {
+ union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4q_s32 (int32_t * __a, int32x4x4_t __b)
+ {
+ union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+@@ -11794,7 +13507,8 @@ vst4q_s32 (int32_t * __a, int32x4x4_t __b)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4q_f16 (float16_t * __a, float16x8x4_t __b)
+ {
+ union { float16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+@@ -11802,63 +13516,72 @@ vst4q_f16 (float16_t * __a, float16x8x4_t __b)
+ }
+ #endif
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4q_f32 (float32_t * __a, float32x4x4_t __b)
+ {
+ union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4v4sf ((__builtin_neon_sf *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4q_u8 (uint8_t * __a, uint8x16x4_t __b)
+ {
+ union { uint8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4q_u16 (uint16_t * __a, uint16x8x4_t __b)
+ {
+ union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4q_u32 (uint32_t * __a, uint32x4x4_t __b)
+ {
+ union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4v4si ((__builtin_neon_si *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4q_p8 (poly8_t * __a, poly8x16x4_t __b)
+ {
+ union { poly8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4q_p16 (poly16_t * __a, poly16x8x4_t __b)
+ {
+ union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_lane_s8 (int8_t * __a, int8x8x4_t __b, const int __c)
+ {
+ union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_lane_s16 (int16_t * __a, int16x4x4_t __b, const int __c)
+ {
+ union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_lane_s32 (int32_t * __a, int32x2x4_t __b, const int __c)
+ {
+ union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -11866,7 +13589,8 @@ vst4_lane_s32 (int32_t * __a, int32x2x4_t __b, const int __c)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_lane_f16 (float16_t * __a, float16x4x4_t __b, const int __c)
+ {
+ union { float16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+@@ -11874,56 +13598,64 @@ vst4_lane_f16 (float16_t * __a, float16x4x4_t __b, const int __c)
+ }
+ #endif
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_lane_f32 (float32_t * __a, float32x2x4_t __b, const int __c)
+ {
+ union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev2sf ((__builtin_neon_sf *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_lane_u8 (uint8_t * __a, uint8x8x4_t __b, const int __c)
+ {
+ union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_lane_u16 (uint16_t * __a, uint16x4x4_t __b, const int __c)
+ {
+ union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_lane_u32 (uint32_t * __a, uint32x2x4_t __b, const int __c)
+ {
+ union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_lane_p8 (poly8_t * __a, poly8x8x4_t __b, const int __c)
+ {
+ union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4_lane_p16 (poly16_t * __a, poly16x4x4_t __b, const int __c)
+ {
+ union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4q_lane_s16 (int16_t * __a, int16x8x4_t __b, const int __c)
+ {
+ union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4q_lane_s32 (int32_t * __a, int32x4x4_t __b, const int __c)
+ {
+ union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+@@ -11931,7 +13663,8 @@ vst4q_lane_s32 (int32_t * __a, int32x4x4_t __b, const int __c)
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4q_lane_f16 (float16_t * __a, float16x8x4_t __b, const int __c)
+ {
+ union { float16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+@@ -11939,529 +13672,616 @@ vst4q_lane_f16 (float16_t * __a, float16x8x4_t __b, const int __c)
+ }
+ #endif
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4q_lane_f32 (float32_t * __a, float32x4x4_t __b, const int __c)
+ {
+ union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev4sf ((__builtin_neon_sf *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4q_lane_u16 (uint16_t * __a, uint16x8x4_t __b, const int __c)
+ {
+ union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4q_lane_u32 (uint32_t * __a, uint32x4x4_t __b, const int __c)
+ {
+ union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vst4q_lane_p16 (poly16_t * __a, poly16x8x4_t __b, const int __c)
+ {
+ union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vand_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return __a & __b;
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vand_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return __a & __b;
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vand_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return __a & __b;
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vand_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return __a & __b;
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vand_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return __a & __b;
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vand_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return __a & __b;
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vand_s64 (int64x1_t __a, int64x1_t __b)
+ {
+ return __a & __b;
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vand_u64 (uint64x1_t __a, uint64x1_t __b)
+ {
+ return __a & __b;
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vandq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return __a & __b;
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vandq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return __a & __b;
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vandq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return __a & __b;
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vandq_s64 (int64x2_t __a, int64x2_t __b)
+ {
+ return __a & __b;
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vandq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return __a & __b;
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vandq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return __a & __b;
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vandq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return __a & __b;
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vandq_u64 (uint64x2_t __a, uint64x2_t __b)
+ {
+ return __a & __b;
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorr_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return __a | __b;
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorr_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return __a | __b;
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorr_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return __a | __b;
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorr_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return __a | __b;
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorr_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return __a | __b;
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorr_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return __a | __b;
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorr_s64 (int64x1_t __a, int64x1_t __b)
+ {
+ return __a | __b;
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorr_u64 (uint64x1_t __a, uint64x1_t __b)
+ {
+ return __a | __b;
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorrq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return __a | __b;
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorrq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return __a | __b;
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorrq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return __a | __b;
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorrq_s64 (int64x2_t __a, int64x2_t __b)
+ {
+ return __a | __b;
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorrq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return __a | __b;
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorrq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return __a | __b;
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorrq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return __a | __b;
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorrq_u64 (uint64x2_t __a, uint64x2_t __b)
+ {
+ return __a | __b;
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ veor_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return __a ^ __b;
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ veor_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return __a ^ __b;
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ veor_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return __a ^ __b;
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ veor_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return __a ^ __b;
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ veor_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return __a ^ __b;
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ veor_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return __a ^ __b;
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ veor_s64 (int64x1_t __a, int64x1_t __b)
+ {
+ return __a ^ __b;
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ veor_u64 (uint64x1_t __a, uint64x1_t __b)
+ {
+ return __a ^ __b;
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ veorq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return __a ^ __b;
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ veorq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return __a ^ __b;
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ veorq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return __a ^ __b;
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ veorq_s64 (int64x2_t __a, int64x2_t __b)
+ {
+ return __a ^ __b;
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ veorq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return __a ^ __b;
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ veorq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return __a ^ __b;
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ veorq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return __a ^ __b;
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ veorq_u64 (uint64x2_t __a, uint64x2_t __b)
+ {
+ return __a ^ __b;
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbic_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return __a & ~__b;
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbic_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return __a & ~__b;
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbic_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return __a & ~__b;
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbic_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return __a & ~__b;
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbic_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return __a & ~__b;
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbic_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return __a & ~__b;
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbic_s64 (int64x1_t __a, int64x1_t __b)
+ {
+ return __a & ~__b;
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbic_u64 (uint64x1_t __a, uint64x1_t __b)
+ {
+ return __a & ~__b;
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbicq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return __a & ~__b;
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbicq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return __a & ~__b;
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbicq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return __a & ~__b;
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbicq_s64 (int64x2_t __a, int64x2_t __b)
+ {
+ return __a & ~__b;
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbicq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return __a & ~__b;
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbicq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return __a & ~__b;
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbicq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return __a & ~__b;
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vbicq_u64 (uint64x2_t __a, uint64x2_t __b)
+ {
+ return __a & ~__b;
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorn_s8 (int8x8_t __a, int8x8_t __b)
+ {
+ return __a | ~__b;
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorn_s16 (int16x4_t __a, int16x4_t __b)
+ {
+ return __a | ~__b;
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorn_s32 (int32x2_t __a, int32x2_t __b)
+ {
+ return __a | ~__b;
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorn_u8 (uint8x8_t __a, uint8x8_t __b)
+ {
+ return __a | ~__b;
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorn_u16 (uint16x4_t __a, uint16x4_t __b)
+ {
+ return __a | ~__b;
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorn_u32 (uint32x2_t __a, uint32x2_t __b)
+ {
+ return __a | ~__b;
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorn_s64 (int64x1_t __a, int64x1_t __b)
+ {
+ return __a | ~__b;
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vorn_u64 (uint64x1_t __a, uint64x1_t __b)
+ {
+ return __a | ~__b;
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vornq_s8 (int8x16_t __a, int8x16_t __b)
+ {
+ return __a | ~__b;
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vornq_s16 (int16x8_t __a, int16x8_t __b)
+ {
+ return __a | ~__b;
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vornq_s32 (int32x4_t __a, int32x4_t __b)
+ {
+ return __a | ~__b;
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vornq_s64 (int64x2_t __a, int64x2_t __b)
+ {
+ return __a | ~__b;
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vornq_u8 (uint8x16_t __a, uint8x16_t __b)
+ {
+ return __a | ~__b;
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vornq_u16 (uint16x8_t __a, uint16x8_t __b)
+ {
+ return __a | ~__b;
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vornq_u32 (uint32x4_t __a, uint32x4_t __b)
+ {
+ return __a | ~__b;
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vornq_u64 (uint64x2_t __a, uint64x2_t __b)
+ {
+ return __a | ~__b;
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p8_p16 (poly16x4_t __a)
+ {
+ return (poly8x8_t) __a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p8_f16 (float16x4_t __a)
+ {
+ return (poly8x8_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p8_f32 (float32x2_t __a)
+ {
+ return (poly8x8_t)__a;
+@@ -12469,76 +14289,88 @@ vreinterpret_p8_f32 (float32x2_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p8_p64 (poly64x1_t __a)
+ {
+ return (poly8x8_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p8_s64 (int64x1_t __a)
+ {
+ return (poly8x8_t)__a;
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p8_u64 (uint64x1_t __a)
+ {
+ return (poly8x8_t)__a;
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p8_s8 (int8x8_t __a)
+ {
+ return (poly8x8_t)__a;
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p8_s16 (int16x4_t __a)
+ {
+ return (poly8x8_t)__a;
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p8_s32 (int32x2_t __a)
+ {
+ return (poly8x8_t)__a;
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p8_u8 (uint8x8_t __a)
+ {
+ return (poly8x8_t)__a;
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p8_u16 (uint16x4_t __a)
+ {
+ return (poly8x8_t)__a;
+ }
+
+-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p8_u32 (uint32x2_t __a)
+ {
+ return (poly8x8_t)__a;
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p16_p8 (poly8x8_t __a)
+ {
+ return (poly16x4_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p16_f16 (float16x4_t __a)
+ {
+ return (poly16x4_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p16_f32 (float32x2_t __a)
+ {
+ return (poly16x4_t)__a;
+@@ -12546,63 +14378,73 @@ vreinterpret_p16_f32 (float32x2_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p16_p64 (poly64x1_t __a)
+ {
+ return (poly16x4_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p16_s64 (int64x1_t __a)
+ {
+ return (poly16x4_t)__a;
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p16_u64 (uint64x1_t __a)
+ {
+ return (poly16x4_t)__a;
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p16_s8 (int8x8_t __a)
+ {
+ return (poly16x4_t)__a;
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p16_s16 (int16x4_t __a)
+ {
+ return (poly16x4_t)__a;
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p16_s32 (int32x2_t __a)
+ {
+ return (poly16x4_t)__a;
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p16_u8 (uint8x8_t __a)
+ {
+ return (poly16x4_t)__a;
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p16_u16 (uint16x4_t __a)
+ {
+ return (poly16x4_t)__a;
+ }
+
+-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p16_u32 (uint32x2_t __a)
+ {
+ return (poly16x4_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f16_p8 (poly8x8_t __a)
+ {
+ return (float16x4_t) __a;
+@@ -12610,7 +14452,8 @@ vreinterpret_f16_p8 (poly8x8_t __a)
+ #endif
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f16_p16 (poly16x4_t __a)
+ {
+ return (float16x4_t) __a;
+@@ -12618,7 +14461,8 @@ vreinterpret_f16_p16 (poly16x4_t __a)
+ #endif
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f16_f32 (float32x2_t __a)
+ {
+ return (float16x4_t) __a;
+@@ -12628,7 +14472,8 @@ vreinterpret_f16_f32 (float32x2_t __a)
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f16_p64 (poly64x1_t __a)
+ {
+ return (float16x4_t) __a;
+@@ -12637,7 +14482,8 @@ vreinterpret_f16_p64 (poly64x1_t __a)
+ #endif
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f16_s64 (int64x1_t __a)
+ {
+ return (float16x4_t) __a;
+@@ -12645,7 +14491,8 @@ vreinterpret_f16_s64 (int64x1_t __a)
+ #endif
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f16_u64 (uint64x1_t __a)
+ {
+ return (float16x4_t) __a;
+@@ -12653,7 +14500,8 @@ vreinterpret_f16_u64 (uint64x1_t __a)
+ #endif
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f16_s8 (int8x8_t __a)
+ {
+ return (float16x4_t) __a;
+@@ -12661,7 +14509,8 @@ vreinterpret_f16_s8 (int8x8_t __a)
+ #endif
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f16_s16 (int16x4_t __a)
+ {
+ return (float16x4_t) __a;
+@@ -12669,7 +14518,8 @@ vreinterpret_f16_s16 (int16x4_t __a)
+ #endif
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f16_s32 (int32x2_t __a)
+ {
+ return (float16x4_t) __a;
+@@ -12677,7 +14527,8 @@ vreinterpret_f16_s32 (int32x2_t __a)
+ #endif
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f16_u8 (uint8x8_t __a)
+ {
+ return (float16x4_t) __a;
+@@ -12685,7 +14536,8 @@ vreinterpret_f16_u8 (uint8x8_t __a)
+ #endif
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f16_u16 (uint16x4_t __a)
+ {
+ return (float16x4_t) __a;
+@@ -12693,27 +14545,31 @@ vreinterpret_f16_u16 (uint16x4_t __a)
+ #endif
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f16_u32 (uint32x2_t __a)
+ {
+ return (float16x4_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f32_p8 (poly8x8_t __a)
+ {
+ return (float32x2_t)__a;
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f32_p16 (poly16x4_t __a)
+ {
+ return (float32x2_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f32_f16 (float16x4_t __a)
+ {
+ return (float32x2_t) __a;
+@@ -12722,56 +14578,65 @@ vreinterpret_f32_f16 (float16x4_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f32_p64 (poly64x1_t __a)
+ {
+ return (float32x2_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f32_s64 (int64x1_t __a)
+ {
+ return (float32x2_t)__a;
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f32_u64 (uint64x1_t __a)
+ {
+ return (float32x2_t)__a;
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f32_s8 (int8x8_t __a)
+ {
+ return (float32x2_t)__a;
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f32_s16 (int16x4_t __a)
+ {
+ return (float32x2_t)__a;
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f32_s32 (int32x2_t __a)
+ {
+ return (float32x2_t)__a;
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f32_u8 (uint8x8_t __a)
+ {
+ return (float32x2_t)__a;
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f32_u16 (uint16x4_t __a)
+ {
+ return (float32x2_t)__a;
+ }
+
+-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_f32_u32 (uint32x2_t __a)
+ {
+ return (float32x2_t)__a;
+@@ -12779,102 +14644,118 @@ vreinterpret_f32_u32 (uint32x2_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p64_p8 (poly8x8_t __a)
+ {
+ return (poly64x1_t)__a;
+ }
+
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p64_p16 (poly16x4_t __a)
+ {
+ return (poly64x1_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p64_f16 (float16x4_t __a)
+ {
+ return (poly64x1_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p64_f32 (float32x2_t __a)
+ {
+ return (poly64x1_t)__a;
+ }
+
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p64_s64 (int64x1_t __a)
+ {
+ return (poly64x1_t)__a;
+ }
+
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p64_u64 (uint64x1_t __a)
+ {
+ return (poly64x1_t)__a;
+ }
+
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p64_s8 (int8x8_t __a)
+ {
+ return (poly64x1_t)__a;
+ }
+
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p64_s16 (int16x4_t __a)
+ {
+ return (poly64x1_t)__a;
+ }
+
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p64_s32 (int32x2_t __a)
+ {
+ return (poly64x1_t)__a;
+ }
+
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p64_u8 (uint8x8_t __a)
+ {
+ return (poly64x1_t)__a;
+ }
+
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p64_u16 (uint16x4_t __a)
+ {
+ return (poly64x1_t)__a;
+ }
+
+-__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_p64_u32 (uint32x2_t __a)
+ {
+ return (poly64x1_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s64_p8 (poly8x8_t __a)
+ {
+ return (int64x1_t)__a;
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s64_p16 (poly16x4_t __a)
+ {
+ return (int64x1_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s64_f16 (float16x4_t __a)
+ {
+ return (int64x1_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s64_f32 (float32x2_t __a)
+ {
+ return (int64x1_t)__a;
+@@ -12882,76 +14763,88 @@ vreinterpret_s64_f32 (float32x2_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s64_p64 (poly64x1_t __a)
+ {
+ return (int64x1_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s64_u64 (uint64x1_t __a)
+ {
+ return (int64x1_t)__a;
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s64_s8 (int8x8_t __a)
+ {
+ return (int64x1_t)__a;
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s64_s16 (int16x4_t __a)
+ {
+ return (int64x1_t)__a;
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s64_s32 (int32x2_t __a)
+ {
+ return (int64x1_t)__a;
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s64_u8 (uint8x8_t __a)
+ {
+ return (int64x1_t)__a;
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s64_u16 (uint16x4_t __a)
+ {
+ return (int64x1_t)__a;
+ }
+
+-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s64_u32 (uint32x2_t __a)
+ {
+ return (int64x1_t)__a;
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u64_p8 (poly8x8_t __a)
+ {
+ return (uint64x1_t)__a;
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u64_p16 (poly16x4_t __a)
+ {
+ return (uint64x1_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u64_f16 (float16x4_t __a)
+ {
+ return (uint64x1_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u64_f32 (float32x2_t __a)
+ {
+ return (uint64x1_t)__a;
+@@ -12959,76 +14852,88 @@ vreinterpret_u64_f32 (float32x2_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u64_p64 (poly64x1_t __a)
+ {
+ return (uint64x1_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u64_s64 (int64x1_t __a)
+ {
+ return (uint64x1_t)__a;
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u64_s8 (int8x8_t __a)
+ {
+ return (uint64x1_t)__a;
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u64_s16 (int16x4_t __a)
+ {
+ return (uint64x1_t)__a;
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u64_s32 (int32x2_t __a)
+ {
+ return (uint64x1_t)__a;
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u64_u8 (uint8x8_t __a)
+ {
+ return (uint64x1_t)__a;
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u64_u16 (uint16x4_t __a)
+ {
+ return (uint64x1_t)__a;
+ }
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u64_u32 (uint32x2_t __a)
+ {
+ return (uint64x1_t)__a;
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s8_p8 (poly8x8_t __a)
+ {
+ return (int8x8_t)__a;
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s8_p16 (poly16x4_t __a)
+ {
+ return (int8x8_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s8_f16 (float16x4_t __a)
+ {
+ return (int8x8_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s8_f32 (float32x2_t __a)
+ {
+ return (int8x8_t)__a;
+@@ -13036,76 +14941,88 @@ vreinterpret_s8_f32 (float32x2_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s8_p64 (poly64x1_t __a)
+ {
+ return (int8x8_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s8_s64 (int64x1_t __a)
+ {
+ return (int8x8_t)__a;
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s8_u64 (uint64x1_t __a)
+ {
+ return (int8x8_t)__a;
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s8_s16 (int16x4_t __a)
+ {
+ return (int8x8_t)__a;
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s8_s32 (int32x2_t __a)
+ {
+ return (int8x8_t)__a;
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s8_u8 (uint8x8_t __a)
+ {
+ return (int8x8_t)__a;
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s8_u16 (uint16x4_t __a)
+ {
+ return (int8x8_t)__a;
+ }
+
+-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s8_u32 (uint32x2_t __a)
+ {
+ return (int8x8_t)__a;
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s16_p8 (poly8x8_t __a)
+ {
+ return (int16x4_t)__a;
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s16_p16 (poly16x4_t __a)
+ {
+ return (int16x4_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s16_f16 (float16x4_t __a)
+ {
+ return (int16x4_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s16_f32 (float32x2_t __a)
+ {
+ return (int16x4_t)__a;
+@@ -13113,76 +15030,88 @@ vreinterpret_s16_f32 (float32x2_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s16_p64 (poly64x1_t __a)
+ {
+ return (int16x4_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s16_s64 (int64x1_t __a)
+ {
+ return (int16x4_t)__a;
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s16_u64 (uint64x1_t __a)
+ {
+ return (int16x4_t)__a;
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s16_s8 (int8x8_t __a)
+ {
+ return (int16x4_t)__a;
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s16_s32 (int32x2_t __a)
+ {
+ return (int16x4_t)__a;
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s16_u8 (uint8x8_t __a)
+ {
+ return (int16x4_t)__a;
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s16_u16 (uint16x4_t __a)
+ {
+ return (int16x4_t)__a;
+ }
+
+-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s16_u32 (uint32x2_t __a)
+ {
+ return (int16x4_t)__a;
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s32_p8 (poly8x8_t __a)
+ {
+ return (int32x2_t)__a;
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s32_p16 (poly16x4_t __a)
+ {
+ return (int32x2_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s32_f16 (float16x4_t __a)
+ {
+ return (int32x2_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s32_f32 (float32x2_t __a)
+ {
+ return (int32x2_t)__a;
+@@ -13190,76 +15119,88 @@ vreinterpret_s32_f32 (float32x2_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s32_p64 (poly64x1_t __a)
+ {
+ return (int32x2_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s32_s64 (int64x1_t __a)
+ {
+ return (int32x2_t)__a;
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s32_u64 (uint64x1_t __a)
+ {
+ return (int32x2_t)__a;
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s32_s8 (int8x8_t __a)
+ {
+ return (int32x2_t)__a;
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s32_s16 (int16x4_t __a)
+ {
+ return (int32x2_t)__a;
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s32_u8 (uint8x8_t __a)
+ {
+ return (int32x2_t)__a;
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s32_u16 (uint16x4_t __a)
+ {
+ return (int32x2_t)__a;
+ }
+
+-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_s32_u32 (uint32x2_t __a)
+ {
+ return (int32x2_t)__a;
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u8_p8 (poly8x8_t __a)
+ {
+ return (uint8x8_t)__a;
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u8_p16 (poly16x4_t __a)
+ {
+ return (uint8x8_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u8_f16 (float16x4_t __a)
+ {
+ return (uint8x8_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u8_f32 (float32x2_t __a)
+ {
+ return (uint8x8_t)__a;
+@@ -13267,76 +15208,88 @@ vreinterpret_u8_f32 (float32x2_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u8_p64 (poly64x1_t __a)
+ {
+ return (uint8x8_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u8_s64 (int64x1_t __a)
+ {
+ return (uint8x8_t)__a;
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u8_u64 (uint64x1_t __a)
+ {
+ return (uint8x8_t)__a;
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u8_s8 (int8x8_t __a)
+ {
+ return (uint8x8_t)__a;
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u8_s16 (int16x4_t __a)
+ {
+ return (uint8x8_t)__a;
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u8_s32 (int32x2_t __a)
+ {
+ return (uint8x8_t)__a;
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u8_u16 (uint16x4_t __a)
+ {
+ return (uint8x8_t)__a;
+ }
+
+-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u8_u32 (uint32x2_t __a)
+ {
+ return (uint8x8_t)__a;
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u16_p8 (poly8x8_t __a)
+ {
+ return (uint16x4_t)__a;
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u16_p16 (poly16x4_t __a)
+ {
+ return (uint16x4_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u16_f16 (float16x4_t __a)
+ {
+ return (uint16x4_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u16_f32 (float32x2_t __a)
+ {
+ return (uint16x4_t)__a;
+@@ -13344,76 +15297,88 @@ vreinterpret_u16_f32 (float32x2_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u16_p64 (poly64x1_t __a)
+ {
+ return (uint16x4_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u16_s64 (int64x1_t __a)
+ {
+ return (uint16x4_t)__a;
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u16_u64 (uint64x1_t __a)
+ {
+ return (uint16x4_t)__a;
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u16_s8 (int8x8_t __a)
+ {
+ return (uint16x4_t)__a;
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u16_s16 (int16x4_t __a)
+ {
+ return (uint16x4_t)__a;
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u16_s32 (int32x2_t __a)
+ {
+ return (uint16x4_t)__a;
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u16_u8 (uint8x8_t __a)
+ {
+ return (uint16x4_t)__a;
+ }
+
+-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u16_u32 (uint32x2_t __a)
+ {
+ return (uint16x4_t)__a;
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u32_p8 (poly8x8_t __a)
+ {
+ return (uint32x2_t)__a;
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u32_p16 (poly16x4_t __a)
+ {
+ return (uint32x2_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u32_f16 (float16x4_t __a)
+ {
+ return (uint32x2_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u32_f32 (float32x2_t __a)
+ {
+ return (uint32x2_t)__a;
+@@ -13421,70 +15386,81 @@ vreinterpret_u32_f32 (float32x2_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u32_p64 (poly64x1_t __a)
+ {
+ return (uint32x2_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u32_s64 (int64x1_t __a)
+ {
+ return (uint32x2_t)__a;
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u32_u64 (uint64x1_t __a)
+ {
+ return (uint32x2_t)__a;
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u32_s8 (int8x8_t __a)
+ {
+ return (uint32x2_t)__a;
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u32_s16 (int16x4_t __a)
+ {
+ return (uint32x2_t)__a;
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u32_s32 (int32x2_t __a)
+ {
+ return (uint32x2_t)__a;
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u32_u8 (uint8x8_t __a)
+ {
+ return (uint32x2_t)__a;
+ }
+
+-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpret_u32_u16 (uint16x4_t __a)
+ {
+ return (uint32x2_t)__a;
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p8_p16 (poly16x8_t __a)
+ {
+ return (poly8x16_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p8_f16 (float16x8_t __a)
+ {
+ return (poly8x16_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p8_f32 (float32x4_t __a)
+ {
+ return (poly8x16_t)__a;
+@@ -13492,83 +15468,96 @@ vreinterpretq_p8_f32 (float32x4_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p8_p64 (poly64x2_t __a)
+ {
+ return (poly8x16_t)__a;
+ }
+
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p8_p128 (poly128_t __a)
+ {
+ return (poly8x16_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p8_s64 (int64x2_t __a)
+ {
+ return (poly8x16_t)__a;
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p8_u64 (uint64x2_t __a)
+ {
+ return (poly8x16_t)__a;
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p8_s8 (int8x16_t __a)
+ {
+ return (poly8x16_t)__a;
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p8_s16 (int16x8_t __a)
+ {
+ return (poly8x16_t)__a;
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p8_s32 (int32x4_t __a)
+ {
+ return (poly8x16_t)__a;
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p8_u8 (uint8x16_t __a)
+ {
+ return (poly8x16_t)__a;
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p8_u16 (uint16x8_t __a)
+ {
+ return (poly8x16_t)__a;
+ }
+
+-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p8_u32 (uint32x4_t __a)
+ {
+ return (poly8x16_t)__a;
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p16_p8 (poly8x16_t __a)
+ {
+ return (poly16x8_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p16_f16 (float16x8_t __a)
+ {
+ return (poly16x8_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p16_f32 (float32x4_t __a)
+ {
+ return (poly16x8_t)__a;
+@@ -13576,69 +15565,80 @@ vreinterpretq_p16_f32 (float32x4_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p16_p64 (poly64x2_t __a)
+ {
+ return (poly16x8_t)__a;
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p16_p128 (poly128_t __a)
+ {
+ return (poly16x8_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p16_s64 (int64x2_t __a)
+ {
+ return (poly16x8_t)__a;
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p16_u64 (uint64x2_t __a)
+ {
+ return (poly16x8_t)__a;
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p16_s8 (int8x16_t __a)
+ {
+ return (poly16x8_t)__a;
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p16_s16 (int16x8_t __a)
+ {
+ return (poly16x8_t)__a;
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p16_s32 (int32x4_t __a)
+ {
+ return (poly16x8_t)__a;
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p16_u8 (uint8x16_t __a)
+ {
+ return (poly16x8_t)__a;
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p16_u16 (uint16x8_t __a)
+ {
+ return (poly16x8_t)__a;
+ }
+
+-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p16_u32 (uint32x4_t __a)
+ {
+ return (poly16x8_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f16_p8 (poly8x16_t __a)
+ {
+ return (float16x8_t) __a;
+@@ -13646,7 +15646,8 @@ vreinterpretq_f16_p8 (poly8x16_t __a)
+ #endif
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f16_p16 (poly16x8_t __a)
+ {
+ return (float16x8_t) __a;
+@@ -13654,7 +15655,8 @@ vreinterpretq_f16_p16 (poly16x8_t __a)
+ #endif
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f16_f32 (float32x4_t __a)
+ {
+ return (float16x8_t) __a;
+@@ -13665,7 +15667,8 @@ vreinterpretq_f16_f32 (float32x4_t __a)
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f16_p64 (poly64x2_t __a)
+ {
+ return (float16x8_t) __a;
+@@ -13673,7 +15676,8 @@ vreinterpretq_f16_p64 (poly64x2_t __a)
+ #endif
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f16_p128 (poly128_t __a)
+ {
+ return (float16x8_t) __a;
+@@ -13683,7 +15687,8 @@ vreinterpretq_f16_p128 (poly128_t __a)
+ #pragma GCC pop_options
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f16_s64 (int64x2_t __a)
+ {
+ return (float16x8_t) __a;
+@@ -13691,7 +15696,8 @@ vreinterpretq_f16_s64 (int64x2_t __a)
+ #endif
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f16_u64 (uint64x2_t __a)
+ {
+ return (float16x8_t) __a;
+@@ -13699,7 +15705,8 @@ vreinterpretq_f16_u64 (uint64x2_t __a)
+ #endif
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f16_s8 (int8x16_t __a)
+ {
+ return (float16x8_t) __a;
+@@ -13707,7 +15714,8 @@ vreinterpretq_f16_s8 (int8x16_t __a)
+ #endif
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f16_s16 (int16x8_t __a)
+ {
+ return (float16x8_t) __a;
+@@ -13715,7 +15723,8 @@ vreinterpretq_f16_s16 (int16x8_t __a)
+ #endif
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f16_s32 (int32x4_t __a)
+ {
+ return (float16x8_t) __a;
+@@ -13723,7 +15732,8 @@ vreinterpretq_f16_s32 (int32x4_t __a)
+ #endif
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f16_u8 (uint8x16_t __a)
+ {
+ return (float16x8_t) __a;
+@@ -13731,7 +15741,8 @@ vreinterpretq_f16_u8 (uint8x16_t __a)
+ #endif
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f16_u16 (uint16x8_t __a)
+ {
+ return (float16x8_t) __a;
+@@ -13739,27 +15750,31 @@ vreinterpretq_f16_u16 (uint16x8_t __a)
+ #endif
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f16_u32 (uint32x4_t __a)
+ {
+ return (float16x8_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f32_p8 (poly8x16_t __a)
+ {
+ return (float32x4_t)__a;
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f32_p16 (poly16x8_t __a)
+ {
+ return (float32x4_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f32_f16 (float16x8_t __a)
+ {
+ return (float32x4_t) __a;
+@@ -13768,62 +15783,72 @@ vreinterpretq_f32_f16 (float16x8_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f32_p64 (poly64x2_t __a)
+ {
+ return (float32x4_t)__a;
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f32_p128 (poly128_t __a)
+ {
+ return (float32x4_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f32_s64 (int64x2_t __a)
+ {
+ return (float32x4_t)__a;
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f32_u64 (uint64x2_t __a)
+ {
+ return (float32x4_t)__a;
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f32_s8 (int8x16_t __a)
+ {
+ return (float32x4_t)__a;
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f32_s16 (int16x8_t __a)
+ {
+ return (float32x4_t)__a;
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f32_s32 (int32x4_t __a)
+ {
+ return (float32x4_t)__a;
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f32_u8 (uint8x16_t __a)
+ {
+ return (float32x4_t)__a;
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f32_u16 (uint16x8_t __a)
+ {
+ return (float32x4_t)__a;
+ }
+
+-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_f32_u32 (uint32x4_t __a)
+ {
+ return (float32x4_t)__a;
+@@ -13831,188 +15856,218 @@ vreinterpretq_f32_u32 (uint32x4_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p64_p8 (poly8x16_t __a)
+ {
+ return (poly64x2_t)__a;
+ }
+
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p64_p16 (poly16x8_t __a)
+ {
+ return (poly64x2_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p64_f16 (float16x8_t __a)
+ {
+ return (poly64x2_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p64_f32 (float32x4_t __a)
+ {
+ return (poly64x2_t)__a;
+ }
+
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p64_p128 (poly128_t __a)
+ {
+ return (poly64x2_t)__a;
+ }
+
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p64_s64 (int64x2_t __a)
+ {
+ return (poly64x2_t)__a;
+ }
+
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p64_u64 (uint64x2_t __a)
+ {
+ return (poly64x2_t)__a;
+ }
+
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p64_s8 (int8x16_t __a)
+ {
+ return (poly64x2_t)__a;
+ }
+
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p64_s16 (int16x8_t __a)
+ {
+ return (poly64x2_t)__a;
+ }
+
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p64_s32 (int32x4_t __a)
+ {
+ return (poly64x2_t)__a;
+ }
+
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p64_u8 (uint8x16_t __a)
+ {
+ return (poly64x2_t)__a;
+ }
+
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p64_u16 (uint16x8_t __a)
+ {
+ return (poly64x2_t)__a;
+ }
+
+-__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p64_u32 (uint32x4_t __a)
+ {
+ return (poly64x2_t)__a;
+ }
+
+-__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p128_p8 (poly8x16_t __a)
+ {
+ return (poly128_t)__a;
+ }
+
+-__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p128_p16 (poly16x8_t __a)
+ {
+ return (poly128_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p128_f16 (float16x8_t __a)
+ {
+ return (poly128_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p128_f32 (float32x4_t __a)
+ {
+ return (poly128_t)__a;
+ }
+
+-__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p128_p64 (poly64x2_t __a)
+ {
+ return (poly128_t)__a;
+ }
+
+-__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p128_s64 (int64x2_t __a)
+ {
+ return (poly128_t)__a;
+ }
+
+-__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p128_u64 (uint64x2_t __a)
+ {
+ return (poly128_t)__a;
+ }
+
+-__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p128_s8 (int8x16_t __a)
+ {
+ return (poly128_t)__a;
+ }
+
+-__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p128_s16 (int16x8_t __a)
+ {
+ return (poly128_t)__a;
+ }
+
+-__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p128_s32 (int32x4_t __a)
+ {
+ return (poly128_t)__a;
+ }
+
+-__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p128_u8 (uint8x16_t __a)
+ {
+ return (poly128_t)__a;
+ }
+
+-__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p128_u16 (uint16x8_t __a)
+ {
+ return (poly128_t)__a;
+ }
+
+-__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_p128_u32 (uint32x4_t __a)
+ {
+ return (poly128_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s64_p8 (poly8x16_t __a)
+ {
+ return (int64x2_t)__a;
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s64_p16 (poly16x8_t __a)
+ {
+ return (int64x2_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s64_f16 (float16x8_t __a)
+ {
+ return (int64x2_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s64_f32 (float32x4_t __a)
+ {
+ return (int64x2_t)__a;
+@@ -14020,82 +16075,95 @@ vreinterpretq_s64_f32 (float32x4_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s64_p64 (poly64x2_t __a)
+ {
+ return (int64x2_t)__a;
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s64_p128 (poly128_t __a)
+ {
+ return (int64x2_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s64_u64 (uint64x2_t __a)
+ {
+ return (int64x2_t)__a;
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s64_s8 (int8x16_t __a)
+ {
+ return (int64x2_t)__a;
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s64_s16 (int16x8_t __a)
+ {
+ return (int64x2_t)__a;
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s64_s32 (int32x4_t __a)
+ {
+ return (int64x2_t)__a;
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s64_u8 (uint8x16_t __a)
+ {
+ return (int64x2_t)__a;
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s64_u16 (uint16x8_t __a)
+ {
+ return (int64x2_t)__a;
+ }
+
+-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s64_u32 (uint32x4_t __a)
+ {
+ return (int64x2_t)__a;
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u64_p8 (poly8x16_t __a)
+ {
+ return (uint64x2_t)__a;
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u64_p16 (poly16x8_t __a)
+ {
+ return (uint64x2_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u64_f16 (float16x8_t __a)
+ {
+ return (uint64x2_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u64_f32 (float32x4_t __a)
+ {
+ return (uint64x2_t)__a;
+@@ -14103,82 +16171,95 @@ vreinterpretq_u64_f32 (float32x4_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u64_p64 (poly64x2_t __a)
+ {
+ return (uint64x2_t)__a;
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u64_p128 (poly128_t __a)
+ {
+ return (uint64x2_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u64_s64 (int64x2_t __a)
+ {
+ return (uint64x2_t)__a;
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u64_s8 (int8x16_t __a)
+ {
+ return (uint64x2_t)__a;
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u64_s16 (int16x8_t __a)
+ {
+ return (uint64x2_t)__a;
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u64_s32 (int32x4_t __a)
+ {
+ return (uint64x2_t)__a;
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u64_u8 (uint8x16_t __a)
+ {
+ return (uint64x2_t)__a;
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u64_u16 (uint16x8_t __a)
+ {
+ return (uint64x2_t)__a;
+ }
+
+-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u64_u32 (uint32x4_t __a)
+ {
+ return (uint64x2_t)__a;
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s8_p8 (poly8x16_t __a)
+ {
+ return (int8x16_t)__a;
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s8_p16 (poly16x8_t __a)
+ {
+ return (int8x16_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s8_f16 (float16x8_t __a)
+ {
+ return (int8x16_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s8_f32 (float32x4_t __a)
+ {
+ return (int8x16_t)__a;
+@@ -14186,82 +16267,95 @@ vreinterpretq_s8_f32 (float32x4_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s8_p64 (poly64x2_t __a)
+ {
+ return (int8x16_t)__a;
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s8_p128 (poly128_t __a)
+ {
+ return (int8x16_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s8_s64 (int64x2_t __a)
+ {
+ return (int8x16_t)__a;
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s8_u64 (uint64x2_t __a)
+ {
+ return (int8x16_t)__a;
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s8_s16 (int16x8_t __a)
+ {
+ return (int8x16_t)__a;
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s8_s32 (int32x4_t __a)
+ {
+ return (int8x16_t)__a;
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s8_u8 (uint8x16_t __a)
+ {
+ return (int8x16_t)__a;
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s8_u16 (uint16x8_t __a)
+ {
+ return (int8x16_t)__a;
+ }
+
+-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s8_u32 (uint32x4_t __a)
+ {
+ return (int8x16_t)__a;
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s16_p8 (poly8x16_t __a)
+ {
+ return (int16x8_t)__a;
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s16_p16 (poly16x8_t __a)
+ {
+ return (int16x8_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s16_f16 (float16x8_t __a)
+ {
+ return (int16x8_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s16_f32 (float32x4_t __a)
+ {
+ return (int16x8_t)__a;
+@@ -14269,82 +16363,95 @@ vreinterpretq_s16_f32 (float32x4_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s16_p64 (poly64x2_t __a)
+ {
+ return (int16x8_t)__a;
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s16_p128 (poly128_t __a)
+ {
+ return (int16x8_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s16_s64 (int64x2_t __a)
+ {
+ return (int16x8_t)__a;
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s16_u64 (uint64x2_t __a)
+ {
+ return (int16x8_t)__a;
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s16_s8 (int8x16_t __a)
+ {
+ return (int16x8_t)__a;
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s16_s32 (int32x4_t __a)
+ {
+ return (int16x8_t)__a;
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s16_u8 (uint8x16_t __a)
+ {
+ return (int16x8_t)__a;
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s16_u16 (uint16x8_t __a)
+ {
+ return (int16x8_t)__a;
+ }
+
+-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s16_u32 (uint32x4_t __a)
+ {
+ return (int16x8_t)__a;
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s32_p8 (poly8x16_t __a)
+ {
+ return (int32x4_t)__a;
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s32_p16 (poly16x8_t __a)
+ {
+ return (int32x4_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s32_f16 (float16x8_t __a)
+ {
+ return (int32x4_t)__a;
+ }
+ #endif
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s32_f32 (float32x4_t __a)
+ {
+ return (int32x4_t)__a;
+@@ -14352,82 +16459,95 @@ vreinterpretq_s32_f32 (float32x4_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s32_p64 (poly64x2_t __a)
+ {
+ return (int32x4_t)__a;
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s32_p128 (poly128_t __a)
+ {
+ return (int32x4_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s32_s64 (int64x2_t __a)
+ {
+ return (int32x4_t)__a;
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s32_u64 (uint64x2_t __a)
+ {
+ return (int32x4_t)__a;
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s32_s8 (int8x16_t __a)
+ {
+ return (int32x4_t)__a;
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s32_s16 (int16x8_t __a)
+ {
+ return (int32x4_t)__a;
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s32_u8 (uint8x16_t __a)
+ {
+ return (int32x4_t)__a;
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s32_u16 (uint16x8_t __a)
+ {
+ return (int32x4_t)__a;
+ }
+
+-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_s32_u32 (uint32x4_t __a)
+ {
+ return (int32x4_t)__a;
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u8_p8 (poly8x16_t __a)
+ {
+ return (uint8x16_t)__a;
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u8_p16 (poly16x8_t __a)
+ {
+ return (uint8x16_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u8_f16 (float16x8_t __a)
+ {
+ return (uint8x16_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u8_f32 (float32x4_t __a)
+ {
+ return (uint8x16_t)__a;
+@@ -14435,82 +16555,95 @@ vreinterpretq_u8_f32 (float32x4_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u8_p64 (poly64x2_t __a)
+ {
+ return (uint8x16_t)__a;
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u8_p128 (poly128_t __a)
+ {
+ return (uint8x16_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u8_s64 (int64x2_t __a)
+ {
+ return (uint8x16_t)__a;
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u8_u64 (uint64x2_t __a)
+ {
+ return (uint8x16_t)__a;
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u8_s8 (int8x16_t __a)
+ {
+ return (uint8x16_t)__a;
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u8_s16 (int16x8_t __a)
+ {
+ return (uint8x16_t)__a;
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u8_s32 (int32x4_t __a)
+ {
+ return (uint8x16_t)__a;
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u8_u16 (uint16x8_t __a)
+ {
+ return (uint8x16_t)__a;
+ }
+
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u8_u32 (uint32x4_t __a)
+ {
+ return (uint8x16_t)__a;
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u16_p8 (poly8x16_t __a)
+ {
+ return (uint16x8_t)__a;
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u16_p16 (poly16x8_t __a)
+ {
+ return (uint16x8_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u16_f16 (float16x8_t __a)
+ {
+ return (uint16x8_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u16_f32 (float32x4_t __a)
+ {
+ return (uint16x8_t)__a;
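
For readers not steeped in GCC inline semantics: the pattern these hunks
switch every intrinsic to behaves roughly like the sketch below (illustrative,
not patch code; `add_one' is an invented name). Under __gnu_inline__, an
`extern inline' function is always folded into its caller and never emits an
out-of-line symbol, and __artificial__ makes debuggers step over it as a
single unit rather than treating it as a function of its own.

  __extension__ extern __inline int
  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
  add_one (int __x)
  {
    /* Always inlined; no standalone add_one symbol is ever emitted.  */
    return __x + 1;
  }
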
+@@ -14518,82 +16651,95 @@ vreinterpretq_u16_f32 (float32x4_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u16_p64 (poly64x2_t __a)
+ {
+ return (uint16x8_t)__a;
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u16_p128 (poly128_t __a)
+ {
+ return (uint16x8_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u16_s64 (int64x2_t __a)
+ {
+ return (uint16x8_t)__a;
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u16_u64 (uint64x2_t __a)
+ {
+ return (uint16x8_t)__a;
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u16_s8 (int8x16_t __a)
+ {
+ return (uint16x8_t)__a;
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u16_s16 (int16x8_t __a)
+ {
+ return (uint16x8_t)__a;
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u16_s32 (int32x4_t __a)
+ {
+ return (uint16x8_t)__a;
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u16_u8 (uint8x16_t __a)
+ {
+ return (uint16x8_t)__a;
+ }
+
+-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u16_u32 (uint32x4_t __a)
+ {
+ return (uint16x8_t)__a;
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u32_p8 (poly8x16_t __a)
+ {
+ return (uint32x4_t)__a;
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u32_p16 (poly16x8_t __a)
+ {
+ return (uint32x4_t)__a;
+ }
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u32_f16 (float16x8_t __a)
+ {
+ return (uint32x4_t) __a;
+ }
+ #endif
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u32_f32 (float32x4_t __a)
+ {
+ return (uint32x4_t)__a;
+@@ -14601,56 +16747,65 @@ vreinterpretq_u32_f32 (float32x4_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u32_p64 (poly64x2_t __a)
+ {
+ return (uint32x4_t)__a;
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u32_p128 (poly128_t __a)
+ {
+ return (uint32x4_t)__a;
+ }
+
+ #pragma GCC pop_options
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u32_s64 (int64x2_t __a)
+ {
+ return (uint32x4_t)__a;
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u32_u64 (uint64x2_t __a)
+ {
+ return (uint32x4_t)__a;
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u32_s8 (int8x16_t __a)
+ {
+ return (uint32x4_t)__a;
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u32_s16 (int16x8_t __a)
+ {
+ return (uint32x4_t)__a;
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u32_s32 (int32x4_t __a)
+ {
+ return (uint32x4_t)__a;
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u32_u8 (uint8x16_t __a)
+ {
+ return (uint32x4_t)__a;
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vreinterpretq_u32_u16 (uint16x8_t __a)
+ {
+ return (uint32x4_t)__a;
+@@ -14659,7 +16814,8 @@ vreinterpretq_u32_u16 (uint16x8_t __a)
+
+ #pragma GCC push_options
+ #pragma GCC target ("fpu=crypto-neon-fp-armv8")
+-__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vldrq_p128 (poly128_t const * __ptr)
+ {
+ #ifdef __ARM_BIG_ENDIAN
+@@ -14672,7 +16828,8 @@ vldrq_p128 (poly128_t const * __ptr)
+ #endif
+ }
+
+-__extension__ static __inline void __attribute__ ((__always_inline__))
++__extension__ extern __inline void
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vstrq_p128 (poly128_t * __ptr, poly128_t __val)
+ {
+ #ifdef __ARM_BIG_ENDIAN
+@@ -14695,7 +16852,8 @@ vstrq_p128 (poly128_t * __ptr, poly128_t __val)
+ If the result is all zeroes for any half then the whole result is zeroes.
+ This is what the pairwise min reduction achieves. */
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vceq_p64 (poly64x1_t __a, poly64x1_t __b)
+ {
+ uint32x2_t __t_a = vreinterpret_u32_p64 (__a);
+@@ -14710,7 +16868,8 @@ vceq_p64 (poly64x1_t __a, poly64x1_t __b)
+ a reduction with max since if any two corresponding bits
+ in the two poly64_t's match, then the whole result must be all ones. */
+
+-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint64x1_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vtst_p64 (poly64x1_t __a, poly64x1_t __b)
+ {
+ uint32x2_t __t_a = vreinterpret_u32_p64 (__a);
+@@ -14720,31 +16879,36 @@ vtst_p64 (poly64x1_t __a, poly64x1_t __b)
+ return vreinterpret_u64_u32 (__m);
+ }
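
The comments in the hunks above explain the poly64 compare trick; a rough
scalar model of vceq_p64 (an illustration of that description, not the
patch's actual NEON code) looks like this:

  #include <stdint.h>

  uint64_t model_vceq_p64 (uint64_t a, uint64_t b)
  {
    /* Compare the two 32-bit halves independently...  */
    uint32_t lo = ((uint32_t) a == (uint32_t) b) ? ~0u : 0u;
    uint32_t hi = ((uint32_t) (a >> 32) == (uint32_t) (b >> 32)) ? ~0u : 0u;
    /* ...then fold with min, so the lane is all-ones only when BOTH
       halves matched -- the pairwise vpmin reduction in the real code.  */
    uint32_t m = lo < hi ? lo : hi;
    return ((uint64_t) m << 32) | m;
  }

vtst_p64 is the dual: it folds with max instead, since matching bits in
either half should force the whole lane to ones.
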
-@@ -10958,13 +11438,16 @@
- ;; We only care about the lower 16 bits of the constant
- ;; being inserted into the upper 16 bits of the register.
- (define_insn "*arm_movtas_ze"
-- [(set (zero_extract:SI (match_operand:SI 0 "s_register_operand" "+r")
-+ [(set (zero_extract:SI (match_operand:SI 0 "s_register_operand" "+r,r")
- (const_int 16)
- (const_int 16))
- (match_operand:SI 1 "const_int_operand" ""))]
-- "arm_arch_thumb2"
-- "movt%?\t%0, %L1"
-- [(set_attr "predicable" "yes")
-+ "TARGET_HAVE_MOVT"
-+ "@
-+ movt%?\t%0, %L1
-+ movt\t%0, %L1"
-+ [(set_attr "arch" "32,v8mb")
-+ (set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
- (set_attr "length" "4")
- (set_attr "type" "alu_sreg")]
---- a/src/gcc/config/arm/arm.opt
-+++ b/src/gcc/config/arm/arm.opt
-@@ -109,6 +109,10 @@ mfloat-abi=
- Target RejectNegative Joined Enum(float_abi_type) Var(arm_float_abi) Init(TARGET_DEFAULT_FLOAT_ABI)
- Specify if floating point hardware should be used.
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaeseq_u8 (uint8x16_t __data, uint8x16_t __key)
+ {
+ return __builtin_arm_crypto_aese (__data, __key);
+ }
-+mcmse
-+Target RejectNegative Var(use_cmse)
-+Specify that the compiler should target secure code as per ARMv8-M Security Extensions.
-+
- Enum
- Name(float_abi_type) Type(enum float_abi_type)
- Known floating-point ABIs (for use with the -mfloat-abi= option):
---- /dev/null
-+++ b/src/gcc/config/arm/arm_cmse.h
-@@ -0,0 +1,199 @@
-+/* ARMv8-M Secure Extensions intrinsics include file.
-+
-+ Copyright (C) 2015-2016 Free Software Foundation, Inc.
-+ Contributed by ARM Ltd.
-+
-+ This file is part of GCC.
-+
-+ GCC is free software; you can redistribute it and/or modify it
-+ under the terms of the GNU General Public License as published
-+ by the Free Software Foundation; either version 3, or (at your
-+ option) any later version.
-+
-+ GCC is distributed in the hope that it will be useful, but WITHOUT
-+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
-+ License for more details.
-+
-+ Under Section 7 of GPL version 3, you are granted additional
-+ permissions described in the GCC Runtime Library Exception, version
-+ 3.1, as published by the Free Software Foundation.
-+
-+ You should have received a copy of the GNU General Public License and
-+ a copy of the GCC Runtime Library Exception along with this program;
-+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
-+ <http://www.gnu.org/licenses/>. */
-+
-+
-+#ifndef _GCC_ARM_CMSE_H
-+#define _GCC_ARM_CMSE_H
-+
-+#ifdef __cplusplus
-+extern "C" {
-+#endif
-+
-+#if __ARM_FEATURE_CMSE & 1
-+
-+#include <stddef.h>
-+#include <stdint.h>
-+
-+#ifdef __ARM_BIG_ENDIAN
-+
-+typedef union {
-+ struct cmse_address_info {
-+#if __ARM_FEATURE_CMSE & 2
-+ unsigned idau_region:8;
-+ unsigned idau_region_valid:1;
-+ unsigned secure:1;
-+ unsigned nonsecure_readwrite_ok:1;
-+ unsigned nonsecure_read_ok:1;
-+#else
-+ unsigned :12;
-+#endif
-+ unsigned readwrite_ok:1;
-+ unsigned read_ok:1;
-+#if __ARM_FEATURE_CMSE & 2
-+ unsigned sau_region_valid:1;
-+#else
-+ unsigned :1;
-+#endif
-+ unsigned mpu_region_valid:1;
-+#if __ARM_FEATURE_CMSE & 2
-+ unsigned sau_region:8;
-+#else
-+ unsigned :8;
-+#endif
-+ unsigned mpu_region:8;
-+ } flags;
-+ unsigned value;
-+} cmse_address_info_t;
-+
-+#else
-+
-+typedef union {
-+ struct cmse_address_info {
-+ unsigned mpu_region:8;
-+#if __ARM_FEATURE_CMSE & 2
-+ unsigned sau_region:8;
-+#else
-+ unsigned :8;
-+#endif
-+ unsigned mpu_region_valid:1;
-+#if __ARM_FEATURE_CMSE & 2
-+ unsigned sau_region_valid:1;
-+#else
-+ unsigned :1;
-+#endif
-+ unsigned read_ok:1;
-+ unsigned readwrite_ok:1;
-+#if __ARM_FEATURE_CMSE & 2
-+ unsigned nonsecure_read_ok:1;
-+ unsigned nonsecure_readwrite_ok:1;
-+ unsigned secure:1;
-+ unsigned idau_region_valid:1;
-+ unsigned idau_region:8;
-+#else
-+ unsigned :12;
-+#endif
-+ } flags;
-+ unsigned value;
-+} cmse_address_info_t;
-+
-+#endif /* __ARM_BIG_ENDIAN */
-+
-+#define cmse_TT_fptr(p) (__cmse_TT_fptr ((__cmse_fptr)(p)))
-+
-+typedef void (*__cmse_fptr)(void);
-+
-+#define __CMSE_TT_ASM(flags) \
-+{ \
-+ cmse_address_info_t __result; \
-+ __asm__ ("tt" # flags " %0,%1" \
-+ : "=r"(__result) \
-+ : "r"(__p) \
-+ : "memory"); \
-+ return __result; \
-+}
-+
-+__extension__ static __inline __attribute__ ((__always_inline__))
-+cmse_address_info_t
-+__cmse_TT_fptr (__cmse_fptr __p)
-+__CMSE_TT_ASM ()
-+
-+__extension__ static __inline __attribute__ ((__always_inline__))
-+cmse_address_info_t
-+cmse_TT (void *__p)
-+__CMSE_TT_ASM ()
-+
-+#define cmse_TTT_fptr(p) (__cmse_TTT_fptr ((__cmse_fptr)(p)))
-+
-+__extension__ static __inline __attribute__ ((__always_inline__))
-+cmse_address_info_t
-+__cmse_TTT_fptr (__cmse_fptr __p)
-+__CMSE_TT_ASM (t)
-+
-+__extension__ static __inline __attribute__ ((__always_inline__))
-+cmse_address_info_t
-+cmse_TTT (void *__p)
-+__CMSE_TT_ASM (t)
-+
-+#if __ARM_FEATURE_CMSE & 2
-+
-+#define cmse_TTA_fptr(p) (__cmse_TTA_fptr ((__cmse_fptr)(p)))
-+
-+__extension__ static __inline __attribute__ ((__always_inline__))
-+cmse_address_info_t
-+__cmse_TTA_fptr (__cmse_fptr __p)
-+__CMSE_TT_ASM (a)
-+
-+__extension__ static __inline __attribute__ ((__always_inline__))
-+cmse_address_info_t
-+cmse_TTA (void *__p)
-+__CMSE_TT_ASM (a)
-+
-+#define cmse_TTAT_fptr(p) (__cmse_TTAT_fptr ((__cmse_fptr)(p)))
-+
-+__extension__ static __inline cmse_address_info_t
-+__attribute__ ((__always_inline__))
-+__cmse_TTAT_fptr (__cmse_fptr __p)
-+__CMSE_TT_ASM (at)
-+
-+__extension__ static __inline cmse_address_info_t
-+__attribute__ ((__always_inline__))
-+cmse_TTAT (void *__p)
-+__CMSE_TT_ASM (at)
-+
-+/* FIXME: diagnose use outside cmse_nonsecure_entry functions. */
-+__extension__ static __inline int __attribute__ ((__always_inline__))
-+cmse_nonsecure_caller (void)
-+{
-+ return __builtin_arm_cmse_nonsecure_caller ();
-+}
-+
-+#define CMSE_AU_NONSECURE 2
-+#define CMSE_MPU_NONSECURE 16
-+#define CMSE_NONSECURE 18
-+
-+#define cmse_nsfptr_create(p) ((typeof ((p))) ((intptr_t) (p) & ~1))
-+
-+#define cmse_is_nsfptr(p) (!((intptr_t) (p) & 1))
-+
-+#endif /* __ARM_FEATURE_CMSE & 2 */
-+
-+#define CMSE_MPU_UNPRIV 4
-+#define CMSE_MPU_READWRITE 1
-+#define CMSE_MPU_READ 8
-+
-+__extension__ void *
-+cmse_check_address_range (void *, size_t, int);
-+
-+#define cmse_check_pointed_object(p, f) \
-+ ((typeof ((p))) cmse_check_address_range ((p), sizeof (*(p)), (f)))
-+
-+#endif /* __ARM_FEATURE_CMSE & 1 */
-+
-+#ifdef __cplusplus
-+}
-+#endif
-+
-+#endif /* _GCC_ARM_CMSE_H */
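
For orientation, the arm_cmse.h interface in the hunk above is meant to be
used along these lines -- a hypothetical sketch (`buffer_writable', `buf' and
`n' are invented names), which only compiles where the target defines
__ARM_FEATURE_CMSE, e.g. under the -mcmse option added further up:

  #include <stddef.h>
  #include <arm_cmse.h>

  /* Nonzero if a whole buffer handed in by a non-secure caller may be
     written by secure code.  */
  int buffer_writable (void *buf, size_t n)
  {
    cmse_address_info_t info = cmse_TT (buf);  /* query one address */
    if (!info.flags.mpu_region_valid)
      return 0;
    /* cmse_check_address_range yields NULL unless every byte of
       [buf, buf + n) passes the requested permission check.  */
    return cmse_check_address_range (buf, n, CMSE_MPU_READWRITE) != NULL;
  }
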
---- /dev/null
-+++ b/src/gcc/config/arm/arm_fp16.h
-@@ -0,0 +1,255 @@
-+/* ARM FP16 intrinsics include file.
-+
-+ Copyright (C) 2016 Free Software Foundation, Inc.
-+ Contributed by ARM Ltd.
-+
-+ This file is part of GCC.
-+
-+ GCC is free software; you can redistribute it and/or modify it
-+ under the terms of the GNU General Public License as published
-+ by the Free Software Foundation; either version 3, or (at your
-+ option) any later version.
-+
-+ GCC is distributed in the hope that it will be useful, but WITHOUT
-+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
-+ License for more details.
-+
-+ Under Section 7 of GPL version 3, you are granted additional
-+ permissions described in the GCC Runtime Library Exception, version
-+ 3.1, as published by the Free Software Foundation.
-+
-+ You should have received a copy of the GNU General Public License and
-+ a copy of the GCC Runtime Library Exception along with this program;
-+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
-+ <http://www.gnu.org/licenses/>. */
-+
-+#ifndef _GCC_ARM_FP16_H
-+#define _GCC_ARM_FP16_H 1
-+
-+#ifdef __cplusplus
-+extern "C" {
-+#endif
-+
-+#include <stdint.h>
-+
-+/* Intrinsics for FP16 instructions. */
-+#pragma GCC push_options
-+#pragma GCC target ("fpu=fp-armv8")
-+
-+#if defined (__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
-+
-+typedef __fp16 float16_t;
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vabsh_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vabshf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vaddh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __a + __b;
-+}
-+
-+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+vcvtah_s32_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vcvtahssi (__a);
-+}
-+
-+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+vcvtah_u32_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vcvtahusi (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vcvth_f16_s32 (int32_t __a)
-+{
-+ return __builtin_neon_vcvthshf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vcvth_f16_u32 (uint32_t __a)
-+{
-+ return __builtin_neon_vcvthuhf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vcvth_n_f16_s32 (int32_t __a, const int __b)
-+{
-+ return __builtin_neon_vcvths_nhf (__a, __b);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vcvth_n_f16_u32 (uint32_t __a, const int __b)
-+{
-+ return __builtin_neon_vcvthu_nhf ((int32_t)__a, __b);
-+}
-+
-+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+vcvth_n_s32_f16 (float16_t __a, const int __b)
-+{
-+ return __builtin_neon_vcvths_nsi (__a, __b);
-+}
-+
-+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+vcvth_n_u32_f16 (float16_t __a, const int __b)
-+{
-+ return (uint32_t)__builtin_neon_vcvthu_nsi (__a, __b);
-+}
-+
-+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+vcvth_s32_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vcvthssi (__a);
-+}
-+
-+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+vcvth_u32_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vcvthusi (__a);
-+}
-+
-+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+vcvtmh_s32_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vcvtmhssi (__a);
-+}
-+
-+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+vcvtmh_u32_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vcvtmhusi (__a);
-+}
-+
-+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+vcvtnh_s32_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vcvtnhssi (__a);
-+}
-+
-+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+vcvtnh_u32_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vcvtnhusi (__a);
-+}
-+
-+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+vcvtph_s32_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vcvtphssi (__a);
-+}
-+
-+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+vcvtph_u32_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vcvtphusi (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vdivh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __a / __b;
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vfmah_f16 (float16_t __a, float16_t __b, float16_t __c)
-+{
-+ return __builtin_neon_vfmahf (__a, __b, __c);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vfmsh_f16 (float16_t __a, float16_t __b, float16_t __c)
-+{
-+ return __builtin_neon_vfmshf (__a, __b, __c);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vmaxnmh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __builtin_neon_vmaxnmhf (__a, __b);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vminnmh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __builtin_neon_vminnmhf (__a, __b);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vmulh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __a * __b;
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vnegh_f16 (float16_t __a)
-+{
-+ return - __a;
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrndah_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vrndahf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrndh_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vrndhf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrndih_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vrndihf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrndmh_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vrndmhf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrndnh_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vrndnhf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrndph_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vrndphf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrndxh_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vrndxhf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vsqrth_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vsqrthf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vsubh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __a - __b;
-+}
-+
-+#endif /* __ARM_FEATURE_FP16_SCALAR_ARITHMETIC */
-+#pragma GCC pop_options
-+
-+#ifdef __cplusplus
-+}
-+#endif
-+
-+#endif
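
Likewise for the arm_fp16.h file above, which provides scalar half-precision
arithmetic; a minimal illustration (hypothetical function, meaningful only
where __ARM_FEATURE_FP16_SCALAR_ARITHMETIC is defined):

  #include <arm_fp16.h>

  float16_t fma_then_round (float16_t a, float16_t b, float16_t c)
  {
    float16_t t = vfmah_f16 (a, b, c);  /* fused a + b * c, single rounding */
    return vrndnh_f16 (t);              /* round to nearest, ties to even  */
  }
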
---- a/src/gcc/config/arm/arm_neon.h
-+++ b/src/gcc/config/arm/arm_neon.h
-@@ -38,6 +38,7 @@
- extern "C" {
- #endif
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaesdq_u8 (uint8x16_t __data, uint8x16_t __key)
+ {
+ return __builtin_arm_crypto_aesd (__data, __key);
+ }
-+#include <arm_fp16.h>
- #include <stdint.h>
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaesmcq_u8 (uint8x16_t __data)
+ {
+ return __builtin_arm_crypto_aesmc (__data);
+ }
- typedef __simd64_int8_t int8x8_t;
-@@ -530,7 +531,7 @@ vadd_s32 (int32x2_t __a, int32x2_t __b)
- __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
- vadd_f32 (float32x2_t __a, float32x2_t __b)
+-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint8x16_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vaesimcq_u8 (uint8x16_t __data)
{
--#ifdef __FAST_MATH
-+#ifdef __FAST_MATH__
- return __a + __b;
- #else
- return (float32x2_t) __builtin_neon_vaddv2sf (__a, __b);
-@@ -594,7 +595,7 @@ vaddq_s64 (int64x2_t __a, int64x2_t __b)
- __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
- vaddq_f32 (float32x4_t __a, float32x4_t __b)
+ return __builtin_arm_crypto_aesimc (__data);
+ }
+
+-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsha1h_u32 (uint32_t __hash_e)
{
--#ifdef __FAST_MATH
-+#ifdef __FAST_MATH__
- return __a + __b;
- #else
- return (float32x4_t) __builtin_neon_vaddv4sf (__a, __b);
-@@ -1030,7 +1031,7 @@ vmul_s32 (int32x2_t __a, int32x2_t __b)
- __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
- vmul_f32 (float32x2_t __a, float32x2_t __b)
+ uint32x4_t __t = vdupq_n_u32 (0);
+@@ -14753,7 +16917,8 @@ vsha1h_u32 (uint32_t __hash_e)
+ return vgetq_lane_u32 (__t, 0);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsha1cq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
{
--#ifdef __FAST_MATH
-+#ifdef __FAST_MATH__
- return __a * __b;
- #else
- return (float32x2_t) __builtin_neon_vmulfv2sf (__a, __b);
-@@ -1077,7 +1078,7 @@ vmulq_s32 (int32x4_t __a, int32x4_t __b)
- __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
- vmulq_f32 (float32x4_t __a, float32x4_t __b)
+ uint32x4_t __t = vdupq_n_u32 (0);
+@@ -14761,7 +16926,8 @@ vsha1cq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
+ return __builtin_arm_crypto_sha1c (__hash_abcd, __t, __wk);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsha1pq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
{
--#ifdef __FAST_MATH
-+#ifdef __FAST_MATH__
- return __a * __b;
- #else
- return (float32x4_t) __builtin_neon_vmulfv4sf (__a, __b);
-@@ -1678,7 +1679,7 @@ vsub_s32 (int32x2_t __a, int32x2_t __b)
- __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
- vsub_f32 (float32x2_t __a, float32x2_t __b)
+ uint32x4_t __t = vdupq_n_u32 (0);
+@@ -14769,7 +16935,8 @@ vsha1pq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
+ return __builtin_arm_crypto_sha1p (__hash_abcd, __t, __wk);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsha1mq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
{
--#ifdef __FAST_MATH
-+#ifdef __FAST_MATH__
- return __a - __b;
- #else
- return (float32x2_t) __builtin_neon_vsubv2sf (__a, __b);
-@@ -1742,7 +1743,7 @@ vsubq_s64 (int64x2_t __a, int64x2_t __b)
- __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
- vsubq_f32 (float32x4_t __a, float32x4_t __b)
+ uint32x4_t __t = vdupq_n_u32 (0);
+@@ -14777,49 +16944,57 @@ vsha1mq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
+ return __builtin_arm_crypto_sha1m (__hash_abcd, __t, __wk);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsha1su0q_u32 (uint32x4_t __w0_3, uint32x4_t __w4_7, uint32x4_t __w8_11)
{
--#ifdef __FAST_MATH
-+#ifdef __FAST_MATH__
- return __a - __b;
- #else
- return (float32x4_t) __builtin_neon_vsubv4sf (__a, __b);
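
Worth flagging in the `__FAST_MATH' hunks above: GCC predefines only the
spelling with trailing underscores (`echo | gcc -ffast-math -dM -E -' shows
`#define __FAST_MATH__ 1'), so the old `#ifdef __FAST_MATH' branch could
never fire and the builtin path was taken unconditionally. In sketch form:

  /* Only the underscored spelling is predefined under -ffast-math.  */
  #ifdef __FAST_MATH__
  # define HAVE_FAST_MATH 1   /* plain `__a + __b' arithmetic is used */
  #else
  # define HAVE_FAST_MATH 0   /* the IEEE-safe NEON builtin is used   */
  #endif
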
-@@ -2607,6 +2608,12 @@ vtst_p8 (poly8x8_t __a, poly8x8_t __b)
- return (uint8x8_t)__builtin_neon_vtstv8qi ((int8x8_t) __a, (int8x8_t) __b);
+ return __builtin_arm_crypto_sha1su0 (__w0_3, __w4_7, __w8_11);
}
-+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+vtst_p16 (poly16x4_t __a, poly16x4_t __b)
-+{
-+ return (uint16x4_t)__builtin_neon_vtstv4hi ((int16x4_t) __a, (int16x4_t) __b);
-+}
-+
- __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
- vtstq_s8 (int8x16_t __a, int8x16_t __b)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsha1su1q_u32 (uint32x4_t __tw0_3, uint32x4_t __w12_15)
{
-@@ -2649,6 +2656,12 @@ vtstq_p8 (poly8x16_t __a, poly8x16_t __b)
- return (uint8x16_t)__builtin_neon_vtstv16qi ((int8x16_t) __a, (int8x16_t) __b);
+ return __builtin_arm_crypto_sha1su1 (__tw0_3, __w12_15);
}
-+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+vtstq_p16 (poly16x8_t __a, poly16x8_t __b)
-+{
-+ return (uint16x8_t)__builtin_neon_vtstv8hi ((int16x8_t) __a, (int16x8_t) __b);
-+}
-+
- __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
- vabd_s8 (int8x8_t __a, int8x8_t __b)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsha256hq_u32 (uint32x4_t __hash_abcd, uint32x4_t __hash_efgh, uint32x4_t __wk)
{
-@@ -2943,6 +2956,34 @@ vmaxq_f32 (float32x4_t __a, float32x4_t __b)
- return (float32x4_t)__builtin_neon_vmaxfv4sf (__a, __b);
+ return __builtin_arm_crypto_sha256h (__hash_abcd, __hash_efgh, __wk);
}
-+#pragma GCC push_options
-+#pragma GCC target ("fpu=neon-fp-armv8")
-+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+vmaxnm_f32 (float32x2_t a, float32x2_t b)
-+{
-+ return (float32x2_t)__builtin_neon_vmaxnmv2sf (a, b);
-+}
-+
-+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+vmaxnmq_f32 (float32x4_t a, float32x4_t b)
-+{
-+ return (float32x4_t)__builtin_neon_vmaxnmv4sf (a, b);
-+}
-+
-+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+vminnm_f32 (float32x2_t a, float32x2_t b)
-+{
-+ return (float32x2_t)__builtin_neon_vminnmv2sf (a, b);
-+}
-+
-+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+vminnmq_f32 (float32x4_t a, float32x4_t b)
-+{
-+ return (float32x4_t)__builtin_neon_vminnmv4sf (a, b);
-+}
-+#pragma GCC pop_options
-+
-+
- __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
- vmaxq_u8 (uint8x16_t __a, uint8x16_t __b)
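
One semantic note on the vmaxnm/vminnm block above, restating ARM ACLE
behaviour rather than anything specific to this diff: unlike vmax_f32, the
*nm forms follow IEEE 754-2008 maxNum/minNum, so a quiet NaN in one operand
returns the other operand instead of propagating. A hypothetical use
(requires the fpu=neon-fp-armv8 target seen in the pragma):

  #include <arm_neon.h>

  /* Clamp lanes to a floor; NaN lanes in `v' come back as `lo' rather
     than NaN, courtesy of FMAXNM semantics.  */
  float32x2_t clamp_floor (float32x2_t v, float32x2_t lo)
  {
    return vmaxnm_f32 (v, lo);
  }
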
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsha256h2q_u32 (uint32x4_t __hash_abcd, uint32x4_t __hash_efgh, uint32x4_t __wk)
{
-@@ -5370,6 +5411,15 @@ vget_lane_s64 (int64x1_t __a, const int __b)
- return (int64_t)__builtin_neon_vget_lanedi (__a, __b);
+ return __builtin_arm_crypto_sha256h2 (__hash_abcd, __hash_efgh, __wk);
}
-+#pragma GCC push_options
-+#pragma GCC target ("fpu=crypto-neon-fp-armv8")
-+__extension__ static __inline poly64_t __attribute__ ((__always_inline__))
-+vget_lane_p64 (poly64x1_t __a, const int __b)
-+{
-+ return (poly64_t)__builtin_neon_vget_lanedi ((int64x1_t) __a, __b);
-+}
-+
-+#pragma GCC pop_options
- __extension__ static __inline uint64_t __attribute__ ((__always_inline__))
- vget_lane_u64 (uint64x1_t __a, const int __b)
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsha256su0q_u32 (uint32x4_t __w0_3, uint32x4_t __w4_7)
+ {
+ return __builtin_arm_crypto_sha256su0 (__w0_3, __w4_7);
+ }
+
+-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint32x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vsha256su1q_u32 (uint32x4_t __tw0_3, uint32x4_t __w8_11, uint32x4_t __w12_15)
{
-@@ -14830,6 +14880,855 @@ vmull_high_p64 (poly64x2_t __a, poly64x2_t __b)
+ return __builtin_arm_crypto_sha256su1 (__tw0_3, __w8_11, __w12_15);
+ }
+
+-__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmull_p64 (poly64_t __a, poly64_t __b)
+ {
+ return (poly128_t) __builtin_arm_crypto_vmullp64 ((uint64_t) __a, (uint64_t) __b);
+ }
+
+-__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
++__extension__ extern __inline poly128_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+ vmull_high_p64 (poly64x2_t __a, poly64x2_t __b)
+ {
+ poly64_t __t1 = vget_high_p64 (__a);
+@@ -14830,6 +17005,984 @@ vmull_high_p64 (poly64x2_t __a, poly64x2_t __b)
#pragma GCC pop_options
@@ -57362,667 +77013,778 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+#pragma GCC target ("fpu=neon-fp-armv8")
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vabd_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_neon_vabdv4hf (__a, __b);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vabdq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ return __builtin_neon_vabdv8hf (__a, __b);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vabs_f16 (float16x4_t __a)
+{
+ return __builtin_neon_vabsv4hf (__a);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vabsq_f16 (float16x8_t __a)
+{
+ return __builtin_neon_vabsv8hf (__a);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vadd_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_neon_vaddv4hf (__a, __b);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vaddq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ return __builtin_neon_vaddv8hf (__a, __b);
+}
+
-+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcage_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vcagev4hf (__a, __b);
+}
+
-+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcageq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vcagev8hf (__a, __b);
+}
+
-+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcagt_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vcagtv4hf (__a, __b);
+}
+
-+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcagtq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vcagtv8hf (__a, __b);
+}
+
-+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcale_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vcalev4hf (__a, __b);
+}
+
-+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcaleq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vcalev8hf (__a, __b);
+}
+
-+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcalt_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vcaltv4hf (__a, __b);
+}
+
-+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcaltq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vcaltv8hf (__a, __b);
+}
+
-+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceq_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vceqv4hf (__a, __b);
+}
+
-+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vceqv8hf (__a, __b);
+}
+
-+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqz_f16 (float16x4_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vceqzv4hf (__a);
+}
+
-+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqzq_f16 (float16x8_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vceqzv8hf (__a);
+}
+
-+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcge_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vcgev4hf (__a, __b);
+}
+
-+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgeq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vcgev8hf (__a, __b);
+}
+
-+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgez_f16 (float16x4_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vcgezv4hf (__a);
+}
+
-+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgezq_f16 (float16x8_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vcgezv8hf (__a);
+}
+
-+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgt_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vcgtv4hf (__a, __b);
+}
+
-+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vcgtv8hf (__a, __b);
+}
+
-+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtz_f16 (float16x4_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vcgtzv4hf (__a);
+}
+
-+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcgtzq_f16 (float16x8_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vcgtzv8hf (__a);
+}
+
-+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcle_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vclev4hf (__a, __b);
+}
+
-+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcleq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vclev8hf (__a, __b);
+}
+
-+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vclez_f16 (float16x4_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vclezv4hf (__a);
+}
+
-+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vclezq_f16 (float16x8_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vclezv8hf (__a);
+}
+
-+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vclt_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vcltv4hf (__a, __b);
+}
+
-+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcltq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vcltv8hf (__a, __b);
+}
+
-+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcltz_f16 (float16x4_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vcltzv4hf (__a);
+}
+
-+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcltzq_f16 (float16x8_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vcltzv8hf (__a);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_f16_s16 (int16x4_t __a)
+{
+ return (float16x4_t)__builtin_neon_vcvtsv4hi (__a);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_f16_u16 (uint16x4_t __a)
+{
+ return (float16x4_t)__builtin_neon_vcvtuv4hi ((int16x4_t)__a);
+}
+
-+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_s16_f16 (float16x4_t __a)
+{
+ return (int16x4_t)__builtin_neon_vcvtsv4hf (__a);
+}
+
-+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_u16_f16 (float16x4_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vcvtuv4hf (__a);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_f16_s16 (int16x8_t __a)
+{
+ return (float16x8_t)__builtin_neon_vcvtsv8hi (__a);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_f16_u16 (uint16x8_t __a)
+{
+ return (float16x8_t)__builtin_neon_vcvtuv8hi ((int16x8_t)__a);
+}
+
-+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_s16_f16 (float16x8_t __a)
+{
+ return (int16x8_t)__builtin_neon_vcvtsv8hf (__a);
+}
+
-+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_u16_f16 (float16x8_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vcvtuv8hf (__a);
+}
+
-+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvta_s16_f16 (float16x4_t __a)
+{
+ return __builtin_neon_vcvtasv4hf (__a);
+}
+
-+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvta_u16_f16 (float16x4_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vcvtauv4hf (__a);
+}
+
-+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtaq_s16_f16 (float16x8_t __a)
+{
+ return __builtin_neon_vcvtasv8hf (__a);
+}
+
-+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtaq_u16_f16 (float16x8_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vcvtauv8hf (__a);
+}
+
-+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtm_s16_f16 (float16x4_t __a)
+{
+ return __builtin_neon_vcvtmsv4hf (__a);
+}
+
-+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtm_u16_f16 (float16x4_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vcvtmuv4hf (__a);
+}
+
-+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtmq_s16_f16 (float16x8_t __a)
+{
+ return __builtin_neon_vcvtmsv8hf (__a);
+}
+
-+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtmq_u16_f16 (float16x8_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vcvtmuv8hf (__a);
+}
+
-+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtn_s16_f16 (float16x4_t __a)
+{
+ return __builtin_neon_vcvtnsv4hf (__a);
+}
+
-+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtn_u16_f16 (float16x4_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vcvtnuv4hf (__a);
+}
+
-+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtnq_s16_f16 (float16x8_t __a)
+{
+ return __builtin_neon_vcvtnsv8hf (__a);
+}
+
-+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtnq_u16_f16 (float16x8_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vcvtnuv8hf (__a);
+}
+
-+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtp_s16_f16 (float16x4_t __a)
+{
+ return __builtin_neon_vcvtpsv4hf (__a);
+}
+
-+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtp_u16_f16 (float16x4_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vcvtpuv4hf (__a);
+}
+
-+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtpq_s16_f16 (float16x8_t __a)
+{
+ return __builtin_neon_vcvtpsv8hf (__a);
+}
+
-+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtpq_u16_f16 (float16x8_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vcvtpuv8hf (__a);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_n_f16_s16 (int16x4_t __a, const int __b)
+{
+ return __builtin_neon_vcvts_nv4hi (__a, __b);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_n_f16_u16 (uint16x4_t __a, const int __b)
+{
+ return __builtin_neon_vcvtu_nv4hi ((int16x4_t)__a, __b);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_n_f16_s16 (int16x8_t __a, const int __b)
+{
+ return __builtin_neon_vcvts_nv8hi (__a, __b);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_n_f16_u16 (uint16x8_t __a, const int __b)
+{
+ return __builtin_neon_vcvtu_nv8hi ((int16x8_t)__a, __b);
+}
+
-+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_n_s16_f16 (float16x4_t __a, const int __b)
+{
+ return __builtin_neon_vcvts_nv4hf (__a, __b);
+}
+
-+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_n_u16_f16 (float16x4_t __a, const int __b)
+{
+ return (uint16x4_t)__builtin_neon_vcvtu_nv4hf (__a, __b);
+}
+
-+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline int16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_n_s16_f16 (float16x8_t __a, const int __b)
+{
+ return __builtin_neon_vcvts_nv8hf (__a, __b);
+}
+
-+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline uint16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_n_u16_f16 (float16x8_t __a, const int __b)
+{
+ return (uint16x8_t)__builtin_neon_vcvtu_nv8hf (__a, __b);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfma_f16 (float16x4_t __a, float16x4_t __b, float16x4_t __c)
+{
+ return __builtin_neon_vfmav4hf (__a, __b, __c);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmaq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
+{
+ return __builtin_neon_vfmav8hf (__a, __b, __c);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfms_f16 (float16x4_t __a, float16x4_t __b, float16x4_t __c)
+{
+ return __builtin_neon_vfmsv4hf (__a, __b, __c);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmsq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
+{
+ return __builtin_neon_vfmsv8hf (__a, __b, __c);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmax_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_neon_vmaxfv4hf (__a, __b);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ return __builtin_neon_vmaxfv8hf (__a, __b);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxnm_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_neon_vmaxnmv4hf (__a, __b);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmaxnmq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ return __builtin_neon_vmaxnmv8hf (__a, __b);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmin_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_neon_vminfv4hf (__a, __b);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ return __builtin_neon_vminfv8hf (__a, __b);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminnm_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_neon_vminnmv4hf (__a, __b);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vminnmq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ return __builtin_neon_vminnmv8hf (__a, __b);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_neon_vmulfv4hf (__a, __b);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_lane_f16 (float16x4_t __a, float16x4_t __b, const int __c)
+{
+ return __builtin_neon_vmul_lanev4hf (__a, __b, __c);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmul_n_f16 (float16x4_t __a, float16_t __b)
+{
+ return __builtin_neon_vmul_nv4hf (__a, __b);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ return __builtin_neon_vmulfv8hf (__a, __b);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_lane_f16 (float16x8_t __a, float16x4_t __b, const int __c)
+{
+ return __builtin_neon_vmul_lanev8hf (__a, __b, __c);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmulq_n_f16 (float16x8_t __a, float16_t __b)
+{
+ return __builtin_neon_vmul_nv8hf (__a, __b);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vneg_f16 (float16x4_t __a)
+{
+ return __builtin_neon_vnegv4hf (__a);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vnegq_f16 (float16x8_t __a)
+{
+ return __builtin_neon_vnegv8hf (__a);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpadd_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_neon_vpaddv4hf (__a, __b);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpmax_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_neon_vpmaxfv4hf (__a, __b);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vpmin_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_neon_vpminfv4hf (__a, __b);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpe_f16 (float16x4_t __a)
+{
+ return __builtin_neon_vrecpev4hf (__a);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpeq_f16 (float16x8_t __a)
+{
+ return __builtin_neon_vrecpev8hf (__a);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrnd_f16 (float16x4_t __a)
+{
+ return __builtin_neon_vrndv4hf (__a);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndq_f16 (float16x8_t __a)
+{
+ return __builtin_neon_vrndv8hf (__a);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrnda_f16 (float16x4_t __a)
+{
+ return __builtin_neon_vrndav4hf (__a);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndaq_f16 (float16x8_t __a)
+{
+ return __builtin_neon_vrndav8hf (__a);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndm_f16 (float16x4_t __a)
+{
+ return __builtin_neon_vrndmv4hf (__a);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndmq_f16 (float16x8_t __a)
+{
+ return __builtin_neon_vrndmv8hf (__a);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndn_f16 (float16x4_t __a)
+{
+ return __builtin_neon_vrndnv4hf (__a);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndnq_f16 (float16x8_t __a)
+{
+ return __builtin_neon_vrndnv8hf (__a);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndp_f16 (float16x4_t __a)
+{
+ return __builtin_neon_vrndpv4hf (__a);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndpq_f16 (float16x8_t __a)
+{
+ return __builtin_neon_vrndpv8hf (__a);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndx_f16 (float16x4_t __a)
+{
+ return __builtin_neon_vrndxv4hf (__a);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndxq_f16 (float16x8_t __a)
+{
+ return __builtin_neon_vrndxv8hf (__a);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrte_f16 (float16x4_t __a)
+{
+ return __builtin_neon_vrsqrtev4hf (__a);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrteq_f16 (float16x8_t __a)
+{
+ return __builtin_neon_vrsqrtev8hf (__a);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecps_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_neon_vrecpsv4hf (__a, __b);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrecpsq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ return __builtin_neon_vrecpsv8hf (__a, __b);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrts_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_neon_vrsqrtsv4hf (__a, __b);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrsqrtsq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ return __builtin_neon_vrsqrtsv8hf (__a, __b);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsub_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_neon_vsubv4hf (__a, __b);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsubq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ return __builtin_neon_vsubv8hf (__a, __b);
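
One mechanical change repeats across all of the arm_neon.h FP16 intrinsics above: every `static inline` definition becomes `extern inline` combined with `__gnu_inline__` and `__artificial__`. Under GNU inline semantics the body is used only for inlining and no out-of-line symbol is ever emitted from the header, so each translation unit no longer gets its own static copy; `__artificial__` additionally makes debug info point at the caller instead of the header. A minimal sketch of the two styles, using a hypothetical intrinsic name:

/* Old style: a static copy per translation unit.  */
__extension__ static __inline float16x4_t
__attribute__ ((__always_inline__))
vdemo_f16_old (float16x4_t __a)
{
  return __a;
}

/* New style: GNU inline, never emitted out of line.  */
__extension__ extern __inline float16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdemo_f16_new (float16x4_t __a)
{
  return __a;
}
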
@@ -58034,73 +77796,85 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ /* Half-precision data processing intrinsics. */
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbsl_f16 (uint16x4_t __a, float16x4_t __b, float16x4_t __c)
+{
+ return __builtin_neon_vbslv4hf ((int16x4_t)__a, __b, __c);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbslq_f16 (uint16x8_t __a, float16x8_t __b, float16x8_t __c)
+{
+ return __builtin_neon_vbslv8hf ((int16x8_t)__a, __b, __c);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_n_f16 (float16_t __a)
+{
+ return __builtin_neon_vdup_nv4hf (__a);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_n_f16 (float16_t __a)
+{
+ return __builtin_neon_vdup_nv8hf (__a);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_lane_f16 (float16x4_t __a, const int __b)
+{
+ return __builtin_neon_vdup_lanev4hf (__a, __b);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_lane_f16 (float16x4_t __a, const int __b)
+{
+ return __builtin_neon_vdup_lanev8hf (__a, __b);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vext_f16 (float16x4_t __a, float16x4_t __b, const int __c)
+{
+ return __builtin_neon_vextv4hf (__a, __b, __c);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vextq_f16 (float16x8_t __a, float16x8_t __b, const int __c)
+{
+ return __builtin_neon_vextv8hf (__a, __b, __c);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmov_n_f16 (float16_t __a)
+{
+ return __builtin_neon_vdup_nv4hf (__a);
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmovq_n_f16 (float16_t __a)
+{
+ return __builtin_neon_vdup_nv8hf (__a);
+}
+
-+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64_f16 (float16x4_t __a)
+{
+ return (float16x4_t)__builtin_shuffle (__a, (uint16x4_t){ 3, 2, 1, 0 });
+}
+
-+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrev64q_f16 (float16x8_t __a)
+{
+ return
@@ -58108,7 +77882,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ (uint16x8_t){ 3, 2, 1, 0, 7, 6, 5, 4 });
+}
+
-+__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vtrn_f16 (float16x4_t __a, float16x4_t __b)
+{
+ float16x4x2_t __rv;
@@ -58122,7 +77897,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ return __rv;
+}
+
-+__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vtrnq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ float16x8x2_t __rv;
@@ -58140,7 +77916,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ return __rv;
+}
+
-+__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vuzp_f16 (float16x4_t __a, float16x4_t __b)
+{
+ float16x4x2_t __rv;
@@ -58154,7 +77931,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ return __rv;
+}
+
-+__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vuzpq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ float16x8x2_t __rv;
@@ -58172,7 +77950,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ return __rv;
+}
+
-+__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x4x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vzip_f16 (float16x4_t __a, float16x4_t __b)
+{
+ float16x4x2_t __rv;
@@ -58186,7 +77965,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ return __rv;
+}
+
-+__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
++__extension__ extern __inline float16x8x2_t
++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vzipq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ float16x8x2_t __rv;
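
The permute intrinsics added in this hunk (vrev64, vtrn, vuzp, vzip) are built on GCC's generic __builtin_shuffle rather than dedicated NEON builtins. The mechanism in isolation, using plain GNU C vector types so it runs on any host (names here are illustrative, not from the header):

#include <stdio.h>

typedef unsigned short u16x4 __attribute__ ((vector_size (8)));

int main (void)
{
  u16x4 a = { 10, 11, 12, 13 };
  /* Same mask vrev64_f16 uses above: reverse the four 16-bit lanes.  */
  u16x4 r = __builtin_shuffle (a, (u16x4) { 3, 2, 1, 0 });
  printf ("%u %u %u %u\n", r[0], r[1], r[2], r[3]);  /* 13 12 11 10 */
  return 0;
}
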
@@ -58609,7 +78389,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
crc,logic_shift_imm,logics_shift_imm,
- alu_ext,alus_ext,
- extend,mov_shift,mvn_shift"))
-+ alu_ext,alus_ext,bfm,extend,mvn_shift"))
++ alu_ext,alus_ext,bfm,bfx,extend,mvn_shift"))
"cortex_a53_slot_any")
(define_insn_reservation "cortex_a53_alu_shift_reg" 3
@@ -58643,19 +78423,19 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
-;; Model bypasses for unshifted operands to ALU instructions.
+;; Model bypasses for ALU to ALU instructions.
++
++(define_bypass 0 "cortex_a53_shift*"
++ "cortex_a53_alu")
-(define_bypass 1 "cortex_a53_shift"
- "cortex_a53_shift")
-+(define_bypass 0 "cortex_a53_shift*"
-+ "cortex_a53_alu")
++(define_bypass 1 "cortex_a53_shift*"
++ "cortex_a53_shift*,cortex_a53_alu_*")
-(define_bypass 1 "cortex_a53_alu,
- cortex_a53_alu_shift*,
- cortex_a53_alu_rotate_imm,
- cortex_a53_shift"
-+(define_bypass 1 "cortex_a53_shift*"
-+ "cortex_a53_shift*,cortex_a53_alu_*")
-+
+(define_bypass 1 "cortex_a53_alu*"
"cortex_a53_alu")
@@ -58669,12 +78449,12 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
-;; bypass to the accumulator operand of an integer MADD-like operation.
+(define_bypass 2 "cortex_a53_alu*"
+ "cortex_a53_alu_*,cortex_a53_shift*")
++
++;; Model a bypass from MUL/MLA to MLA instructions.
-(define_bypass 1 "cortex_a53_alu*,
- cortex_a53_load*,
- cortex_a53_mul"
-+;; Model a bypass from MUL/MLA to MLA instructions.
-+
+(define_bypass 1 "cortex_a53_mul"
"cortex_a53_mul"
"aarch_accumulator_forwarding")
@@ -58698,7 +78478,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
;; Model bypasses for loads which are to be consumed by the ALU.
-@@ -239,47 +236,37 @@
+@@ -239,47 +236,46 @@
"cortex_a53_alu")
(define_bypass 3 "cortex_a53_load1"
@@ -58739,28 +78519,71 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
"cortex_a53_r2f")
-(define_bypass 2 "cortex_a53_mul,
-- cortex_a53_load1,
++(define_bypass 1 "cortex_a53_mul,
+ cortex_a53_load1,
- cortex_a53_load2,
- cortex_a53_load3plus"
-+(define_bypass 1 "cortex_a53_mul,
-+ cortex_a53_load*"
++ cortex_a53_load2"
"cortex_a53_r2f")
-;; Shifts feeding Load/Store addresses may not be ready in time.
-+;; Model flag forwarding to branches.
++(define_bypass 2 "cortex_a53_alu*"
++ "cortex_a53_r2f_cvt")
-(define_bypass 3 "cortex_a53_shift"
- "cortex_a53_load*"
- "arm_early_load_addr_dep")
--
++(define_bypass 3 "cortex_a53_mul,
++ cortex_a53_load1,
++ cortex_a53_load2"
++ "cortex_a53_r2f_cvt")
+
-(define_bypass 3 "cortex_a53_shift"
- "cortex_a53_store*"
- "arm_early_store_addr_dep")
++;; Model flag forwarding to branches.
++
+(define_bypass 0 "cortex_a53_alu*,cortex_a53_shift*"
+ "cortex_a53_branch")
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point/Advanced SIMD.
+@@ -535,19 +531,25 @@
+ ;; Floating-point to/from core transfers.
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+-(define_insn_reservation "cortex_a53_r2f" 6
++(define_insn_reservation "cortex_a53_r2f" 2
+ (and (eq_attr "tune" "cortexa53")
+- (eq_attr "type" "f_mcr,f_mcrr,f_cvti2f,
+- neon_from_gp, neon_from_gp_q"))
+- "cortex_a53_slot_any,cortex_a53_store,
+- nothing,cortex_a53_fp_alu")
++ (eq_attr "type" "f_mcr,f_mcrr"))
++ "cortex_a53_slot_any,cortex_a53_fp_alu")
+
+-(define_insn_reservation "cortex_a53_f2r" 6
++(define_insn_reservation "cortex_a53_f2r" 4
+ (and (eq_attr "tune" "cortexa53")
+- (eq_attr "type" "f_mrc,f_mrrc,f_cvtf2i,
+- neon_to_gp, neon_to_gp_q"))
+- "cortex_a53_slot_any,cortex_a53_fp_alu,
+- nothing,cortex_a53_store")
++ (eq_attr "type" "f_mrc,f_mrrc"))
++ "cortex_a53_slot_any,cortex_a53_fp_alu")
++
++(define_insn_reservation "cortex_a53_r2f_cvt" 4
++ (and (eq_attr "tune" "cortexa53")
++ (eq_attr "type" "f_cvti2f, neon_from_gp, neon_from_gp_q"))
++ "cortex_a53_slot_any,cortex_a53_fp_alu")
++
++(define_insn_reservation "cortex_a53_f2r_cvt" 5
++ (and (eq_attr "tune" "cortexa53")
++ (eq_attr "type" "f_cvtf2i, neon_to_gp, neon_to_gp_q"))
++ "cortex_a53_slot_any,cortex_a53_fp_alu")
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Floating-point flag transfer.
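
The Cortex-A53 hunks above retune the pipeline description: core-to-FP and FP-to-core moves get shorter fixed latencies, conversions are split out into their own r2f_cvt/f2r_cvt reservations, and new define_bypass entries tell the scheduler that certain producer/consumer pairs forward results early. A deliberately simplified C model of what a bypass means to the scheduler; the numbers are illustrative, and the assumption is simply that a matching bypass latency replaces the producer's advertised latency for that pair:

#include <stdio.h>

/* def_latency: latency advertised by the producer's
   define_insn_reservation.
   bypass_latency: latency from a matching define_bypass.  */
static int
effective_latency (int def_latency, int bypass_latency, int bypass_matches)
{
  return bypass_matches ? bypass_latency : def_latency;
}

int main (void)
{
  /* A producer advertising 3 cycles, with a define_bypass of 1 to some
     consumer class, is scheduled as if ready after 1 cycle there.  */
  printf ("%d\n", effective_latency (3, 1, 1));
  return 0;
}
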
--- a/src/gcc/config/arm/cortex-a57.md
+++ b/src/gcc/config/arm/cortex-a57.md
@@ -297,7 +297,7 @@
@@ -58768,10 +78591,19 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
alu_sreg,alus_sreg,logic_reg,logics_reg,\
adc_imm,adcs_imm,adc_reg,adcs_reg,\
- adr,bfm,clz,rbit,rev,alu_dsp_reg,\
-+ adr,bfm,clz,csel,rbit,rev,alu_dsp_reg,\
++ adr,bfx,extend,clz,rbit,rev,alu_dsp_reg,\
rotate_imm,shift_imm,shift_reg,\
mov_imm,mov_reg,\
mvn_imm,mvn_reg,\
+@@ -307,7 +307,7 @@
+ ;; ALU ops with immediate shift
+ (define_insn_reservation "cortex_a57_alu_shift" 3
+ (and (eq_attr "tune" "cortexa57")
+- (eq_attr "type" "extend,\
++ (eq_attr "type" "bfm,\
+ alu_shift_imm,alus_shift_imm,\
+ crc,logic_shift_imm,logics_shift_imm,\
+ mov_shift,mvn_shift"))
@@ -726,7 +726,7 @@
(define_insn_reservation "cortex_a57_fp_cpys" 4
@@ -58903,6 +78735,26 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
#undef L_fixdfsi
#undef L_fixunsdfsi
#undef L_truncdfsf2
+--- a/src/gcc/config/arm/exynos-m1.md
++++ b/src/gcc/config/arm/exynos-m1.md
+@@ -358,7 +358,7 @@
+ (eq_attr "type" "alu_imm, alus_imm, logic_imm, logics_imm,\
+ alu_sreg, alus_sreg, logic_reg, logics_reg,\
+ adc_imm, adcs_imm, adc_reg, adcs_reg,\
+- adr, bfm, clz, rbit, rev, csel, alu_dsp_reg,\
++ adr, bfm, bfx, clz, rbit, rev, csel, alu_dsp_reg,\
+ shift_imm, shift_reg, rotate_imm, extend,\
+ mov_imm, mov_reg,\
+ mvn_imm, mvn_reg,\
+@@ -372,7 +372,7 @@
+ (eq_attr "type" "alu_imm, alus_imm, logic_imm, logics_imm,\
+ alu_sreg, alus_sreg, logic_reg, logics_reg,\
+ adc_imm, adcs_imm, adc_reg, adcs_reg,\
+- adr, bfm, clz, rbit, rev, alu_dsp_reg,\
++ adr, bfm, bfx, clz, rbit, rev, alu_dsp_reg,\
+ shift_imm, shift_reg, rotate_imm, extend,\
+ mov_imm, mov_reg,\
+ mvn_imm, mvn_reg,\
--- a/src/gcc/config/arm/iterators.md
+++ b/src/gcc/config/arm/iterators.md
@@ -46,7 +46,7 @@
@@ -64348,7 +84200,40 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
;; operation is sufficient to make conditionalizing the sequence
--- a/src/gcc/config/arm/thumb2.md
+++ b/src/gcc/config/arm/thumb2.md
-@@ -278,8 +278,7 @@
+@@ -125,32 +125,6 @@
+ (set_attr "type" "multiple")]
+ )
+
+-;; Thumb-2 does not have rsc, so use a clever trick with shifter operands.
+-(define_insn_and_split "*thumb2_negdi2"
+- [(set (match_operand:DI 0 "s_register_operand" "=&r,r")
+- (neg:DI (match_operand:DI 1 "s_register_operand" "?r,0")))
+- (clobber (reg:CC CC_REGNUM))]
+- "TARGET_THUMB2"
+- "#" ; negs\\t%Q0, %Q1\;sbc\\t%R0, %R1, %R1, lsl #1
+- "&& reload_completed"
+- [(parallel [(set (reg:CC CC_REGNUM)
+- (compare:CC (const_int 0) (match_dup 1)))
+- (set (match_dup 0) (minus:SI (const_int 0) (match_dup 1)))])
+- (set (match_dup 2) (minus:SI (minus:SI (match_dup 3)
+- (ashift:SI (match_dup 3)
+- (const_int 1)))
+- (ltu:SI (reg:CC_C CC_REGNUM) (const_int 0))))]
+- {
+- operands[2] = gen_highpart (SImode, operands[0]);
+- operands[0] = gen_lowpart (SImode, operands[0]);
+- operands[3] = gen_highpart (SImode, operands[1]);
+- operands[1] = gen_lowpart (SImode, operands[1]);
+- }
+- [(set_attr "conds" "clob")
+- (set_attr "length" "8")
+- (set_attr "type" "multiple")]
+-)
+-
+ (define_insn_and_split "*thumb2_abssi2"
+ [(set (match_operand:SI 0 "s_register_operand" "=&r,l,r")
+ (abs:SI (match_operand:SI 1 "s_register_operand" "r,0,0")))
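
The block deleted above was Thumb-2's 64-bit negate splitter, which worked around the missing rsc instruction by computing the high word as hi - (hi << 1) - borrow, i.e. -hi - borrow. The same arithmetic in plain C, as a sketch of what the pattern used to emit (the function and names are illustrative, not compiler code):

#include <stdint.h>
#include <stdio.h>

static int64_t
negdi2 (int64_t x)
{
  uint32_t lo = (uint32_t) x;
  uint32_t hi = (uint32_t) ((uint64_t) x >> 32);
  uint32_t borrow = lo != 0;               /* NEGS leaves a borrow when lo != 0 */
  uint32_t nlo = -lo;                      /* negs  lo', lo                */
  uint32_t nhi = hi - (hi << 1) - borrow;  /* sbc   hi', hi, hi, lsl #1    */
  return (int64_t) (((uint64_t) nhi << 32) | nlo);
}

int main (void)
{
  printf ("%lld %lld\n", (long long) negdi2 (42), (long long) negdi2 (-7));
  /* prints: -42 7 */
  return 0;
}
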
+@@ -278,8 +252,7 @@
(define_insn "*thumb2_movsi_insn"
[(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,l,r,r,l ,*hk,m,*m")
(match_operand:SI 1 "general_operand" "rk,I,Py,K,j,mi,*mi,l,*hk"))]
@@ -64358,7 +84243,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
&& ( register_operand (operands[0], SImode)
|| register_operand (operands[1], SImode))"
"@
-@@ -581,6 +580,19 @@
+@@ -581,6 +554,19 @@
[(set_attr "type" "call")]
)
@@ -64378,7 +84263,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(define_insn "*call_value_reg_thumb2"
[(set (match_operand 0 "" "")
(call (mem:SI (match_operand:SI 1 "register_operand" "l*r"))
-@@ -592,6 +604,21 @@
+@@ -592,6 +578,21 @@
[(set_attr "type" "call")]
)
@@ -64400,7 +84285,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(define_insn "*thumb2_indirect_jump"
[(set (pc)
(match_operand:SI 0 "register_operand" "l*r"))]
-@@ -1115,12 +1142,31 @@
+@@ -1115,12 +1116,31 @@
(define_insn "*thumb2_return"
[(simple_return)]
@@ -64433,6 +84318,24 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
(define_insn_and_split "thumb2_eh_return"
[(unspec_volatile [(match_operand:SI 0 "s_register_operand" "r")]
VUNSPEC_EH_RETURN)
+--- a/src/gcc/config/arm/types.md
++++ b/src/gcc/config/arm/types.md
+@@ -51,6 +51,7 @@
+ ; alus_shift_imm as alu_shift_imm, setting condition flags.
+ ; alus_shift_reg as alu_shift_reg, setting condition flags.
+ ; bfm bitfield move operation.
++; bfx bitfield extract operation.
+ ; block blockage insn, this blocks all functional units.
+ ; branch branch.
+ ; call subroutine call.
+@@ -557,6 +558,7 @@
+ alus_shift_imm,\
+ alus_shift_reg,\
+ bfm,\
++ bfx,\
+ block,\
+ branch,\
+ call,\
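
The types.md hunk above introduces a separate "bfx" scheduling type for single-instruction bitfield extracts (ubfx/sbfx), previously lumped in with "bfm", and the per-core pipeline files in the surrounding hunks are updated to accept the new type. For readers unfamiliar with the instruction class, this is the kind of source idiom such an extract implements; whether a given compiler actually emits ubfx here depends on target and flags:

#include <stdint.h>
#include <stdio.h>

static uint32_t
extract_field (uint32_t x, unsigned lsb, unsigned width)
{
  /* With constant lsb/width (and width < 32) this shift-and-mask
     typically maps to a single ubfx on ARM/AArch64.  */
  return (x >> lsb) & ((1u << width) - 1u);
}

int main (void)
{
  printf ("0x%x\n", extract_field (0xdeadbeefu, 8, 8));  /* 0xbe */
  return 0;
}
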
--- a/src/gcc/config/arm/unspecs.md
+++ b/src/gcc/config/arm/unspecs.md
@@ -84,6 +84,8 @@
@@ -65749,6 +85652,17 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
"mrc\\tp10, 7, %0, cr1, cr0, 0\\t @GET_FPSCR"
[(set_attr "type" "mrs")])
+--- a/src/gcc/config/arm/xgene1.md
++++ b/src/gcc/config/arm/xgene1.md
+@@ -164,7 +164,7 @@
+
+ (define_insn_reservation "xgene1_bfm" 2
+ (and (eq_attr "tune" "xgene1")
+- (eq_attr "type" "bfm"))
++ (eq_attr "type" "bfm,bfx"))
+ "xgene1_decode1op,xgene1_fsu")
+
+ (define_insn_reservation "xgene1_f_rint" 5
--- a/src/gcc/config/i386/i386.c
+++ b/src/gcc/config/i386/i386.c
@@ -23,6 +23,7 @@ along with GCC; see the file COPYING3. If not see
@@ -66998,6 +86912,141 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ fn2 (b[1]);
+}
--- /dev/null
++++ b/src/gcc/testsuite/gcc.c-torture/compile/pr78362.c
+@@ -0,0 +1,11 @@
++/* PR target/78362. */
++
++long a;
++
++void
++foo (void)
++{
++ for (;; a--)
++ if ((int) a)
++ break;
++}
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.c-torture/compile/pr78694.c
+@@ -0,0 +1,118 @@
++/* PR target/78694. */
++
++enum
++{
++ MEMMODEL_RELAXED,
++ MEMMODEL_ACQUIRE,
++ PRIORITY_INSERT_END
++};
++enum
++{
++ PQ_CHILDREN,
++ PQ_TASKGROUP
++};
++struct gomp_team_state
++{
++ struct gomp_team *team;
++};
++enum gomp_task_kind
++{
++ GOMP_TASK_UNDEFERRED,
++ GOMP_TASK_WAITING
++};
++struct gomp_taskwait
++{
++ _Bool in_taskwait;
++};
++struct gomp_task
++{
++ struct gomp_task *parent;
++ int children_queue;
++ struct gomp_taskgroup *taskgroup;
++ int dependers;
++ struct gomp_taskwait taskwait;
++ enum gomp_task_kind kind;
++ _Bool in_tied_task;
++} j, q, *n;
++struct gomp_taskgroup
++{
++ _Bool in_taskgroup_wait;
++ int num_children;
++} l;
++struct gomp_team
++{
++ int task_queue;
++ int task_running_count;
++};
++struct gomp_thread
++{
++ struct gomp_team_state ts;
++ struct gomp_task task;
++} extern __thread a;
++
++int b, c, d, e, f, g, h, i, k, m, o, p, r;
++
++void priority_queue_next_task (struct gomp_task *, int, int);
++int gomp_task_run_pre (struct gomp_task *, struct gomp_task, struct gomp_team);
++void priority_queue_insert (int, struct gomp_task);
++void priority_queue_insert2 (int, struct gomp_task, int, int, int);
++void priority_queue_insert3 (int, struct gomp_task, int, int, int);
++void gomp_sem_post (int);
++void free (void *);
++
++_Bool s;
++int
++GOMP_taskgroup_end ()
++{
++ struct gomp_thread *t = &a;
++ struct gomp_team u = *t->ts.team;
++ struct gomp_task *v = &t->task, *w;
++ if (__atomic_load_n (&l.num_children, MEMMODEL_ACQUIRE))
++ while (1)
++ {
++ if (l.num_children)
++ priority_queue_next_task (v, u.task_queue, r);
++ else if (w)
++ free (w);
++ if (n->kind == GOMP_TASK_WAITING)
++ {
++ s = gomp_task_run_pre (n, q, u);
++ if (__builtin_expect (s, 0))
++ {
++ if (w)
++ free (w);
++ goto finish_cancelled;
++ }
++ n = 0;
++ l.in_taskgroup_wait = 1;
++ }
++ if (w)
++ {
++ t->task = *n;
++ if (__builtin_expect (p, 0))
++ if (o)
++ t->task = *v;
++ }
++ if (n)
++ {
++ struct gomp_task x = x;
++ for (; i; b++)
++ {
++ struct gomp_task y = j;
++ if (g)
++ continue;
++ priority_queue_insert (PQ_CHILDREN, x);
++ if (x.taskwait.in_taskwait)
++ priority_queue_insert2 (PQ_TASKGROUP, y, e, 0, d);
++ if (h)
++ gomp_sem_post (f);
++ priority_queue_insert3 (k, y, PRIORITY_INSERT_END, 0, d);
++ ++c;
++ }
++ }
++ finish_cancelled:
++ w = (struct gomp_task *) (n - u.task_running_count - v);
++ }
++ v->taskgroup = (struct gomp_taskgroup *) m;
++ return 1;
++}
+--- /dev/null
+++ b/src/gcc/testsuite/gcc.c-torture/execute/pr37780.c
@@ -0,0 +1,49 @@
+/* PR middle-end/37780. */
@@ -67139,6 +87188,33 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
-#if x // { dg-warning "\"x\" is not defined .-Wundef." }
+#if x // { dg-warning "\"x\" is not defined, evaluates to 0 .-Wundef." }
#endif
+--- a/src/gcc/testsuite/gcc.dg/lto/pr54709_0.c
++++ b/src/gcc/testsuite/gcc.dg/lto/pr54709_0.c
+@@ -1,6 +1,7 @@
+ /* { dg-lto-do link } */
+ /* { dg-require-visibility "hidden" } */
+ /* { dg-require-effective-target fpic } */
++/* { dg-require-effective-target shared } */
+ /* { dg-extra-ld-options { -shared } } */
+ /* { dg-lto-options { { -fPIC -fvisibility=hidden -flto } } } */
+
+--- a/src/gcc/testsuite/gcc.dg/lto/pr61526_0.c
++++ b/src/gcc/testsuite/gcc.dg/lto/pr61526_0.c
+@@ -1,4 +1,5 @@
+ /* { dg-require-effective-target fpic } */
++/* { dg-require-effective-target shared } */
+ /* { dg-lto-do link } */
+ /* { dg-lto-options { { -fPIC -flto -flto-partition=1to1 } } } */
+ /* { dg-extra-ld-options { -shared } } */
+--- a/src/gcc/testsuite/gcc.dg/lto/pr64415_0.c
++++ b/src/gcc/testsuite/gcc.dg/lto/pr64415_0.c
+@@ -1,5 +1,6 @@
+ /* { dg-lto-do link } */
+ /* { dg-require-effective-target fpic } */
++/* { dg-require-effective-target shared } */
+ /* { dg-lto-options { { -O -flto -fpic } } } */
+ /* { dg-extra-ld-options { -shared } } */
+ /* { dg-extra-ld-options "-Wl,-undefined,dynamic_lookup" { target *-*-darwin* } } */
--- a/src/gcc/testsuite/gcc.dg/plugin/plugin.exp
+++ b/src/gcc/testsuite/gcc.dg/plugin/plugin.exp
@@ -87,6 +87,12 @@ foreach plugin_test $plugin_test_list {
@@ -68274,16 +88350,45 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
}
/* Floating-point variant. */
-@@ -110,7 +125,7 @@ extern size_t strlen(const char *);
+@@ -110,7 +125,36 @@ extern size_t strlen(const char *);
abort(); \
} \
} \
- fprintf(stderr, "CHECKED %s\n", MSG); \
+ fprintf(stderr, "CHECKED %s %s\n", STR(VECT_TYPE(T, W, N)), MSG); \
++ }
++
++/* poly variant. */
++#define CHECK_POLY(MSG,T,W,N,FMT,EXPECTED,COMMENT) \
++ { \
++ int i; \
++ for(i=0; i<N ; i++) \
++ { \
++ union poly_operand { \
++ uint##W##_t i; \
++ poly##W##_t p; \
++ } tmp_res, tmp_exp; \
++ tmp_res.p = VECT_VAR(result, T, W, N)[i]; \
++ tmp_exp.i = VECT_VAR(EXPECTED, T, W, N)[i]; \
++ if (tmp_res.i != tmp_exp.i) { \
++ fprintf(stderr, \
++ "ERROR in %s (%s line %d in buffer '%s') at type %s " \
++ "index %d: got 0x%" FMT " != 0x%" FMT " %s\n", \
++ MSG, __FILE__, __LINE__, \
++ STR(EXPECTED), \
++ STR(VECT_NAME(T, W, N)), \
++ i, \
++ tmp_res.i, \
++ tmp_exp.i, \
++ strlen(COMMENT) > 0 ? COMMENT : ""); \
++ abort(); \
++ } \
++ } \
++ fprintf(stderr, "CHECKED %s %s\n", STR(VECT_TYPE(T, W, N)), MSG); \
}
/* Clean buffer with a non-zero pattern to help diagnose buffer
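
The CHECK_POLY macro added above compares polynomial vectors by punning each lane to the same-width unsigned integer through a union, because poly8/16/64 lanes cannot portably be compared or printed directly. The core idea reduced to a stand-alone sketch; poly64_t is faked with a typedef here so the demo compiles on any host:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t poly64_t;  /* stand-in for the ACLE type */

int main (void)
{
  union poly_operand { uint64_t i; poly64_t p; } tmp_res, tmp_exp;
  tmp_res.p = (poly64_t) 0xfffffffffffffff0ULL;  /* lane read back */
  tmp_exp.i = 0xfffffffffffffff0ULL;             /* expected bits  */
  if (tmp_res.i != tmp_exp.i)
    {
      printf ("ERROR: got 0x%" PRIx64 " != 0x%" PRIx64 "\n",
              tmp_res.i, tmp_exp.i);
      return 1;
    }
  printf ("CHECKED\n");
  return 0;
}
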
-@@ -133,10 +148,16 @@ static ARRAY(result, uint, 32, 2);
+@@ -133,10 +177,16 @@ static ARRAY(result, uint, 32, 2);
static ARRAY(result, uint, 64, 1);
static ARRAY(result, poly, 8, 8);
static ARRAY(result, poly, 16, 4);
@@ -68300,7 +88405,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
static ARRAY(result, int, 8, 16);
static ARRAY(result, int, 16, 8);
static ARRAY(result, int, 32, 4);
-@@ -147,6 +168,9 @@ static ARRAY(result, uint, 32, 4);
+@@ -147,6 +197,9 @@ static ARRAY(result, uint, 32, 4);
static ARRAY(result, uint, 64, 2);
static ARRAY(result, poly, 8, 16);
static ARRAY(result, poly, 16, 8);
@@ -68310,7 +88415,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
static ARRAY(result, float, 16, 8);
#endif
-@@ -169,6 +193,7 @@ extern ARRAY(expected, poly, 8, 8);
+@@ -169,6 +222,7 @@ extern ARRAY(expected, poly, 8, 8);
extern ARRAY(expected, poly, 16, 4);
extern ARRAY(expected, hfloat, 16, 4);
extern ARRAY(expected, hfloat, 32, 2);
@@ -68318,7 +88423,29 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
extern ARRAY(expected, int, 8, 16);
extern ARRAY(expected, int, 16, 8);
extern ARRAY(expected, int, 32, 4);
-@@ -335,7 +360,8 @@ extern int VECT_VAR(expected_cumulative_sat, uint, 64, 2);
+@@ -193,8 +247,8 @@ extern ARRAY(expected, hfloat, 64, 2);
+ CHECK(test_name, uint, 16, 4, PRIx16, EXPECTED, comment); \
+ CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment); \
+ CHECK(test_name, uint, 64, 1, PRIx64, EXPECTED, comment); \
+- CHECK(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
+- CHECK(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
++ CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
++ CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
+ CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment); \
+ \
+ CHECK(test_name, int, 8, 16, PRIx8, EXPECTED, comment); \
+@@ -205,8 +259,8 @@ extern ARRAY(expected, hfloat, 64, 2);
+ CHECK(test_name, uint, 16, 8, PRIx16, EXPECTED, comment); \
+ CHECK(test_name, uint, 32, 4, PRIx32, EXPECTED, comment); \
+ CHECK(test_name, uint, 64, 2, PRIx64, EXPECTED, comment); \
+- CHECK(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \
+- CHECK(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \
++ CHECK_POLY(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \
++ CHECK_POLY(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \
+ CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment); \
+ } \
+
+@@ -335,7 +389,8 @@ extern int VECT_VAR(expected_cumulative_sat, uint, 64, 2);
strlen(COMMENT) > 0 ? " " COMMENT : ""); \
abort(); \
} \
@@ -68328,7 +88455,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
}
#define CHECK_CUMULATIVE_SAT_NAMED(test_name,EXPECTED,comment) \
-@@ -379,6 +405,9 @@ static void clean_results (void)
+@@ -379,6 +434,9 @@ static void clean_results (void)
CLEAN(result, uint, 64, 1);
CLEAN(result, poly, 8, 8);
CLEAN(result, poly, 16, 4);
@@ -68338,7 +88465,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
CLEAN(result, float, 16, 4);
#endif
-@@ -394,6 +423,9 @@ static void clean_results (void)
+@@ -394,6 +452,9 @@ static void clean_results (void)
CLEAN(result, uint, 64, 2);
CLEAN(result, poly, 8, 16);
CLEAN(result, poly, 16, 8);
@@ -68348,7 +88475,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
CLEAN(result, float, 16, 8);
#endif
-@@ -419,6 +451,13 @@ static void clean_results (void)
+@@ -419,6 +480,13 @@ static void clean_results (void)
#define DECL_VARIABLE(VAR, T1, W, N) \
VECT_TYPE(T1, W, N) VECT_VAR(VAR, T1, W, N)
@@ -68362,7 +88489,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Declare only 64 bits signed variants. */
#define DECL_VARIABLE_64BITS_SIGNED_VARIANTS(VAR) \
DECL_VARIABLE(VAR, int, 8, 8); \
-@@ -454,6 +493,7 @@ static void clean_results (void)
+@@ -454,6 +522,7 @@ static void clean_results (void)
DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(VAR); \
DECL_VARIABLE(VAR, poly, 8, 8); \
DECL_VARIABLE(VAR, poly, 16, 4); \
@@ -68370,7 +88497,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
DECL_VARIABLE(VAR, float, 16, 4); \
DECL_VARIABLE(VAR, float, 32, 2)
#else
-@@ -462,6 +502,7 @@ static void clean_results (void)
+@@ -462,6 +531,7 @@ static void clean_results (void)
DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(VAR); \
DECL_VARIABLE(VAR, poly, 8, 8); \
DECL_VARIABLE(VAR, poly, 16, 4); \
@@ -68378,7 +88505,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
DECL_VARIABLE(VAR, float, 32, 2)
#endif
-@@ -472,6 +513,7 @@ static void clean_results (void)
+@@ -472,6 +542,7 @@ static void clean_results (void)
DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(VAR); \
DECL_VARIABLE(VAR, poly, 8, 16); \
DECL_VARIABLE(VAR, poly, 16, 8); \
@@ -68386,7 +88513,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
DECL_VARIABLE(VAR, float, 16, 8); \
DECL_VARIABLE(VAR, float, 32, 4)
#else
-@@ -480,6 +522,7 @@ static void clean_results (void)
+@@ -480,6 +551,7 @@ static void clean_results (void)
DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(VAR); \
DECL_VARIABLE(VAR, poly, 8, 16); \
DECL_VARIABLE(VAR, poly, 16, 8); \
@@ -68394,7 +88521,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
DECL_VARIABLE(VAR, float, 32, 4)
#endif
/* Declare all variants. */
-@@ -500,15 +543,6 @@ static void clean_results (void)
+@@ -500,15 +572,6 @@ static void clean_results (void)
/* Helpers to initialize vectors. */
#define VDUP(VAR, Q, T1, T2, W, N, V) \
VECT_VAR(VAR, T1, W, N) = vdup##Q##_n_##T2##W(V)
@@ -68410,7 +88537,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
#define VSET_LANE(VAR, Q, T1, T2, W, N, L, V) \
VECT_VAR(VAR, T1, W, N) = vset##Q##_lane_##T2##W(V, \
-@@ -521,6 +555,13 @@ static void clean_results (void)
+@@ -521,6 +584,13 @@ static void clean_results (void)
/* Helpers to call macros with 1 constant and 5 variable
arguments. */
@@ -68424,7 +88551,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
#define TEST_MACRO_64BITS_SIGNED_VARIANTS_1_5(MACRO, VAR) \
MACRO(VAR, , int, s, 8, 8); \
MACRO(VAR, , int, s, 16, 4); \
-@@ -591,13 +632,15 @@ static void clean_results (void)
+@@ -591,13 +661,15 @@ static void clean_results (void)
TEST_MACRO_64BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2); \
TEST_MACRO_64BITS_UNSIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2); \
MACRO(VAR1, VAR2, , poly, p, 8, 8); \
@@ -69480,8 +89607,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ TEST_VBSL(uint, , poly, p, 64, 1);
+ TEST_VBSL(uint, q, poly, p, 64, 2);
+
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vbsl_expected, "");
-+ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vbsl_expected, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vbsl_expected, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vbsl_expected, "");
+
+ /* vceq_p64 tests. */
+#undef TEST_MSG
@@ -69532,7 +89659,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+
+ TEST_VCOMBINE(poly, p, 64, 1, 2);
+
-+ CHECK(TEST_MSG, poly, 64, 2, PRIx16, vcombine_expected, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vcombine_expected, "");
+
+ /* vcreate_p64 tests. */
+#undef TEST_MSG
@@ -69555,7 +89682,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+
+ TEST_VCREATE(poly, p, 64, 1);
+
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vcreate_expected, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vcreate_expected, "");
+
+ /* vdup_lane_p64 tests. */
+#undef TEST_MSG
@@ -69579,8 +89706,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ TEST_VDUP_LANE(, poly, p, 64, 1, 1, 0);
+ TEST_VDUP_LANE(q, poly, p, 64, 2, 1, 0);
+
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vdup_lane_expected, "");
-+ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vdup_lane_expected, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vdup_lane_expected, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vdup_lane_expected, "");
+
+ /* vdup_n_p64 tests. */
+#undef TEST_MSG
@@ -69604,16 +89731,16 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+
+ switch (i) {
+ case 0:
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vdup_n_expected0, "");
-+ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vdup_n_expected0, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vdup_n_expected0, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vdup_n_expected0, "");
+ break;
+ case 1:
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vdup_n_expected1, "");
-+ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vdup_n_expected1, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vdup_n_expected1, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vdup_n_expected1, "");
+ break;
+ case 2:
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vdup_n_expected2, "");
-+ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vdup_n_expected2, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vdup_n_expected2, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vdup_n_expected2, "");
+ break;
+ default:
+ abort();
@@ -69650,8 +89777,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ TEST_VEXT(, poly, p, 64, 1, 0);
+ TEST_VEXT(q, poly, p, 64, 2, 1);
+
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vext_expected, "");
-+ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vext_expected, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vext_expected, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vext_expected, "");
+
+ /* vget_low_p64 tests. */
+#undef TEST_MSG
@@ -69671,7 +89798,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+
+ TEST_VGET_LOW(poly, p, 64, 1, 2);
+
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vget_low_expected, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vget_low_expected, "");
+
+ /* vget_high_p64 tests. */
+#undef TEST_MSG
@@ -69691,7 +89818,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+
+ TEST_VGET_HIGH(poly, p, 64, 1, 2);
+
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vget_high_expected, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vget_high_expected, "");
+
+ /* vld1_p64 tests. */
+#undef TEST_MSG
@@ -69713,8 +89840,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ TEST_VLD1(vld1_vector, buffer, , poly, p, 64, 1);
+ TEST_VLD1(vld1_vector, buffer, q, poly, p, 64, 2);
+
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld1_expected, "");
-+ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vld1_expected, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld1_expected, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vld1_expected, "");
+
+ /* vld1_dup_p64 tests. */
+#undef TEST_MSG
@@ -69738,16 +89865,16 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+
+ switch (i) {
+ case 0:
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld1_dup_expected0, "");
-+ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vld1_dup_expected0, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld1_dup_expected0, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vld1_dup_expected0, "");
+ break;
+ case 1:
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld1_dup_expected1, "");
-+ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vld1_dup_expected1, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld1_dup_expected1, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vld1_dup_expected1, "");
+ break;
+ case 2:
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld1_dup_expected2, "");
-+ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vld1_dup_expected2, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld1_dup_expected2, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vld1_dup_expected2, "");
+ break;
+ default:
+ abort();
@@ -69781,8 +89908,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ TEST_VLD1_LANE(, poly, p, 64, 1, 0);
+ TEST_VLD1_LANE(q, poly, p, 64, 2, 0);
+
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld1_lane_expected, "");
-+ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vld1_lane_expected, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld1_lane_expected, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vld1_lane_expected, "");
+
+ /* vldX_p64 tests. */
+#define DECL_VLDX(T1, W, N, X) \
@@ -69819,37 +89946,37 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+#define TEST_MSG "VLD2/VLD2Q"
+ CLEAN(result, poly, 64, 1);
+ TEST_VLDX(, poly, p, 64, 1, 2);
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld2_expected_0, "chunk 0");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld2_expected_0, "chunk 0");
+ CLEAN(result, poly, 64, 1);
+ TEST_EXTRA_CHUNK(poly, 64, 1, 2, 1);
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld2_expected_1, "chunk 1");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld2_expected_1, "chunk 1");
+
+#undef TEST_MSG
+#define TEST_MSG "VLD3/VLD3Q"
+ CLEAN(result, poly, 64, 1);
+ TEST_VLDX(, poly, p, 64, 1, 3);
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld3_expected_0, "chunk 0");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_expected_0, "chunk 0");
+ CLEAN(result, poly, 64, 1);
+ TEST_EXTRA_CHUNK(poly, 64, 1, 3, 1);
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld3_expected_1, "chunk 1");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_expected_1, "chunk 1");
+ CLEAN(result, poly, 64, 1);
+ TEST_EXTRA_CHUNK(poly, 64, 1, 3, 2);
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld3_expected_2, "chunk 2");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_expected_2, "chunk 2");
+
+#undef TEST_MSG
+#define TEST_MSG "VLD4/VLD4Q"
+ CLEAN(result, poly, 64, 1);
+ TEST_VLDX(, poly, p, 64, 1, 4);
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_0, "chunk 0");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_0, "chunk 0");
+ CLEAN(result, poly, 64, 1);
+ TEST_EXTRA_CHUNK(poly, 64, 1, 4, 1);
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_1, "chunk 1");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_1, "chunk 1");
+ CLEAN(result, poly, 64, 1);
+ TEST_EXTRA_CHUNK(poly, 64, 1, 4, 2);
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_2, "chunk 2");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_2, "chunk 2");
+ CLEAN(result, poly, 64, 1);
+ TEST_EXTRA_CHUNK(poly, 64, 1, 4, 3);
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_3, "chunk 3");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_3, "chunk 3");
+
+ /* vldX_dup_p64 tests. */
+#define DECL_VLDX_DUP(T1, W, N, X) \
@@ -69880,37 +90007,37 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+#define TEST_MSG "VLD2_DUP/VLD2Q_DUP"
+ CLEAN(result, poly, 64, 1);
+ TEST_VLDX_DUP(, poly, p, 64, 1, 2);
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld2_dup_expected_0, "chunk 0");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld2_dup_expected_0, "chunk 0");
+ CLEAN(result, poly, 64, 1);
+ TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 2, 1);
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld2_dup_expected_1, "chunk 1");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld2_dup_expected_1, "chunk 1");
+
+#undef TEST_MSG
+#define TEST_MSG "VLD3_DUP/VLD3Q_DUP"
+ CLEAN(result, poly, 64, 1);
+ TEST_VLDX_DUP(, poly, p, 64, 1, 3);
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld3_dup_expected_0, "chunk 0");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_dup_expected_0, "chunk 0");
+ CLEAN(result, poly, 64, 1);
+ TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 3, 1);
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld3_dup_expected_1, "chunk 1");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_dup_expected_1, "chunk 1");
+ CLEAN(result, poly, 64, 1);
+ TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 3, 2);
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld3_dup_expected_2, "chunk 2");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_dup_expected_2, "chunk 2");
+
+#undef TEST_MSG
+#define TEST_MSG "VLD4_DUP/VLD4Q_DUP"
+ CLEAN(result, poly, 64, 1);
+ TEST_VLDX_DUP(, poly, p, 64, 1, 4);
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_0, "chunk 0");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_0, "chunk 0");
+ CLEAN(result, poly, 64, 1);
+ TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 4, 1);
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_1, "chunk 1");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_1, "chunk 1");
+ CLEAN(result, poly, 64, 1);
+ TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 4, 2);
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_2, "chunk 2");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_2, "chunk 2");
+ CLEAN(result, poly, 64, 1);
+ TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 4, 3);
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_3, "chunk 3");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_3, "chunk 3");
+
+ /* vsli_p64 tests. */
+#undef TEST_MSG
@@ -69945,8 +90072,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ TEST_VSXI(vsli, , poly, p, 64, 1, 3);
+ TEST_VSXI(vsli, q, poly, p, 64, 2, 53);
+
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vsli_expected, "");
-+ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vsli_expected, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vsli_expected, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vsli_expected, "");
+
+ /* Test cases with maximum shift amount. */
+ CLEAN(result, poly, 64, 1);
@@ -69956,8 +90083,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ TEST_VSXI(vsli, q, poly, p, 64, 2, 63);
+
+#define COMMENT "(max shift amount)"
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vsli_expected_max_shift, COMMENT);
-+ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vsli_expected_max_shift, COMMENT);
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vsli_expected_max_shift, COMMENT);
++ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vsli_expected_max_shift, COMMENT);
+
+ /* vsri_p64 tests. */
+#undef TEST_MSG
@@ -69975,8 +90102,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ TEST_VSXI(vsri, , poly, p, 64, 1, 3);
+ TEST_VSXI(vsri, q, poly, p, 64, 2, 53);
+
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vsri_expected, "");
-+ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vsri_expected, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vsri_expected, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vsri_expected, "");
+
+ /* Test cases with maximum shift amount. */
+ CLEAN(result, poly, 64, 1);
@@ -69986,8 +90113,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ TEST_VSXI(vsri, q, poly, p, 64, 2, 64);
+
+#define COMMENT "(max shift amount)"
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vsri_expected_max_shift, COMMENT);
-+ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vsri_expected_max_shift, COMMENT);
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vsri_expected_max_shift, COMMENT);
++ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vsri_expected_max_shift, COMMENT);
+
+ /* vst1_lane_p64 tests. */
+#undef TEST_MSG
@@ -70008,8 +90135,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ TEST_VST1_LANE(, poly, p, 64, 1, 0);
+ TEST_VST1_LANE(q, poly, p, 64, 2, 0);
+
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vst1_lane_expected, "");
-+ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vst1_lane_expected, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vst1_lane_expected, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vst1_lane_expected, "");
+
+#ifdef __aarch64__
+
@@ -70035,16 +90162,16 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+
+ switch (i) {
+ case 0:
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vmov_n_expected0, "");
-+ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vmov_n_expected0, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vmov_n_expected0, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vmov_n_expected0, "");
+ break;
+ case 1:
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vmov_n_expected1, "");
-+ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vmov_n_expected1, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vmov_n_expected1, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vmov_n_expected1, "");
+ break;
+ case 2:
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, vmov_n_expected2, "");
-+ CHECK(TEST_MSG, poly, 64, 2, PRIx64, vmov_n_expected2, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vmov_n_expected2, "");
++ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vmov_n_expected2, "");
+ break;
+ default:
+ abort();
@@ -70064,8 +90191,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ TEST_MSG, __FILE__, __LINE__, \
+ STR(VECT_VAR(vget_lane_expected, T1, W, N)), \
+ STR(VECT_NAME(T1, W, N)), \
-+ VECT_VAR(vget_lane_vector, T1, W, N), \
-+ VECT_VAR(vget_lane_expected, T1, W, N)); \
++ (uint##W##_t)VECT_VAR(vget_lane_vector, T1, W, N), \
++ (uint##W##_t)VECT_VAR(vget_lane_expected, T1, W, N)); \
+ abort (); \
+ }
+
@@ -70148,9 +90275,9 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ TEST_EXTRA_CHUNK(poly, 64, 1, X, Y) \
+ TEST_EXTRA_CHUNK(poly, 64, 2, X, Y)
+
-+#define CHECK_RESULTS_VLD_STX_LANE(test_name,EXPECTED,comment) \
-+ CHECK(test_name, poly, 64, 1, PRIx64, EXPECTED, comment); \
-+ CHECK(test_name, poly, 64, 2, PRIx64, EXPECTED, comment);
++#define CHECK_RESULTS_VLD_STX_LANE(test_name,EXPECTED,comment) \
++ CHECK_POLY(test_name, poly, 64, 1, PRIx64, EXPECTED, comment); \
++ CHECK_POLY(test_name, poly, 64, 2, PRIx64, EXPECTED, comment);
+
+ /* Declare the temporary buffers / variables. */
+ DECL_ALL_VLD_STX_LANE(2);
@@ -70192,8 +90319,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+
+ TEST_ALL_EXTRA_CHUNKS(4, 1);
+ CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st4_1, " chunk 1");
-+ TEST_ALL_EXTRA_CHUNKS(4, 2);
+
++ TEST_ALL_EXTRA_CHUNKS(4, 2);
+ CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st4_2, " chunk 2");
+
+ TEST_ALL_EXTRA_CHUNKS(4, 3);
@@ -70245,12 +90372,12 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ TEST_ALL_VSTX_LANE(2);
+
+#define CMT " (chunk 0)"
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st2_0, CMT);
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st2_0, CMT);
+
+ TEST_ALL_EXTRA_CHUNKS(2, 1);
+#undef CMT
+#define CMT " chunk 1"
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st2_1, CMT);
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st2_1, CMT);
+
+ /* Check vst3_lane/vst3q_lane. */
+ clean_results ();
@@ -70260,19 +90387,19 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+
+#undef CMT
+#define CMT " (chunk 0)"
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st3_0, CMT);
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st3_0, CMT);
+
+ TEST_ALL_EXTRA_CHUNKS(3, 1);
+
+#undef CMT
+#define CMT " (chunk 1)"
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st3_1, CMT);
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st3_1, CMT);
+
+ TEST_ALL_EXTRA_CHUNKS(3, 2);
+
+#undef CMT
+#define CMT " (chunk 2)"
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st3_2, CMT);
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st3_2, CMT);
+
+ /* Check vst4_lane/vst4q_lane. */
+ clean_results ();
@@ -70282,25 +90409,25 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+
+#undef CMT
+#define CMT " (chunk 0)"
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_0, CMT);
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_0, CMT);
+
+ TEST_ALL_EXTRA_CHUNKS(4, 1);
+
+#undef CMT
+#define CMT " (chunk 1)"
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_1, CMT);
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_1, CMT);
+
+ TEST_ALL_EXTRA_CHUNKS(4, 2);
+
+#undef CMT
+#define CMT " (chunk 2)"
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_2, CMT);
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_2, CMT);
+
+ TEST_ALL_EXTRA_CHUNKS(4, 3);
+
+#undef CMT
+#define CMT " (chunk 3)"
-+ CHECK(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_3, CMT);
++ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_3, CMT);
+
+#endif /* __aarch64__. */
+
@@ -70515,6 +90642,24 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+
+ return 0;
+}
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/unary_sat_op.inc
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/unary_sat_op.inc
+@@ -61,11 +61,11 @@ void FNNAME (INSN_NAME) (void)
+ TEST_UNARY_SAT_OP(INSN_NAME, q, int, s, 32, 4, expected_cumulative_sat, "");
+
+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected, "");
+- CHECK(TEST_MSG, int, 16, 4, PRIx8, expected, "");
+- CHECK(TEST_MSG, int, 32, 2, PRIx8, expected, "");
++ CHECK(TEST_MSG, int, 16, 4, PRIx16, expected, "");
++ CHECK(TEST_MSG, int, 32, 2, PRIx32, expected, "");
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected, "");
+- CHECK(TEST_MSG, int, 16, 8, PRIx8, expected, "");
+- CHECK(TEST_MSG, int, 32, 4, PRIx8, expected, "");
++ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, "");
++ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, "");
+
+ #ifdef EXTRA_TESTS
+ EXTRA_TESTS();
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/unary_scalar_op.inc
@@ -0,0 +1,200 @@
@@ -71893,6 +92038,47 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+
+/* Include the template for binary scalar operations. */
+#include "unary_scalar_op.inc"
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcnt.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcnt.c
+@@ -65,10 +65,10 @@ FNNAME (INSN_NAME)
+
+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected, "");
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected, "");
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected, "");
+- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
+ }
+
+ int main (void)
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c
+@@ -93,8 +93,8 @@ void exec_vcombine (void)
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
+ CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
+ CHECK(TEST_MSG, uint, 64, 2, PRIx64, expected, "");
+- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
+- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
++ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected, "");
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, "");
+ #endif
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c
+@@ -106,8 +106,8 @@ FNNAME (INSN_NAME)
+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
+ CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, "");
+ CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
+- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
++ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected, "");
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
+ #endif
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvt.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvt.c
@@ -4,36 +4,99 @@
@@ -77295,13 +97481,13 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ VECT_VAR (vsrc_2, float, 64, 2), delem0);
+ vst1q_f64 (VECT_VAR (result, float, 64, 2),
+ VECT_VAR (vector_res, float, 64, 2));
-+ CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms0_static, "");
++ CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfms0_static, "");
+ VECT_VAR (vector_res, float, 64, 2) =
+ vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
+ VECT_VAR (vsrc_2, float, 64, 2), delem0);
+ vst1q_f64 (VECT_VAR (result, float, 64, 2),
+ VECT_VAR (vector_res, float, 64, 2));
-+ CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma0_static, "");
++ CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfma0_static, "");
+
+ VECT_VAR_DECL (buf_src_3, float, 64, 2) [] = {DA2, DA3};
+ VECT_VAR_DECL (buf_src_4, float, 64, 2) [] = {DB2, DB3};
@@ -77312,13 +97498,13 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ VECT_VAR (vsrc_2, float, 64, 2), delem1);
+ vst1q_f64 (VECT_VAR (result, float, 64, 2),
+ VECT_VAR (vector_res, float, 64, 2));
-+ CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms1_static, "");
++ CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfms1_static, "");
+ VECT_VAR (vector_res, float, 64, 2) =
+ vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
+ VECT_VAR (vsrc_2, float, 64, 2), delem1);
+ vst1q_f64 (VECT_VAR (result, float, 64, 2),
+ VECT_VAR (vector_res, float, 64, 2));
-+ CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma1_static, "");
++ CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfma1_static, "");
+
+ VECT_VAR_DECL (buf_src_5, float, 64, 2) [] = {DA4, DA5};
+ VECT_VAR_DECL (buf_src_6, float, 64, 2) [] = {DB4, DB5};
@@ -77329,13 +97515,13 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ VECT_VAR (vsrc_2, float, 64, 2), delem2);
+ vst1q_f64 (VECT_VAR (result, float, 64, 2),
+ VECT_VAR (vector_res, float, 64, 2));
-+ CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms2_static, "");
++ CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfms2_static, "");
+ VECT_VAR (vector_res, float, 64, 2) =
+ vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
+ VECT_VAR (vsrc_2, float, 64, 2), delem2);
+ vst1q_f64 (VECT_VAR (result, float, 64, 2),
+ VECT_VAR (vector_res, float, 64, 2));
-+ CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma2_static, "");
++ CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfma2_static, "");
+
+ VECT_VAR_DECL (buf_src_7, float, 64, 2) [] = {DA6, DA7};
+ VECT_VAR_DECL (buf_src_8, float, 64, 2) [] = {DB6, DB7};
@@ -77346,13 +97532,13 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ VECT_VAR (vsrc_2, float, 64, 2), delem3);
+ vst1q_f64 (VECT_VAR (result, float, 64, 2),
+ VECT_VAR (vector_res, float, 64, 2));
-+ CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms3_static, "");
++ CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfms3_static, "");
+ VECT_VAR (vector_res, float, 64, 2) =
+ vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
+ VECT_VAR (vsrc_2, float, 64, 2), delem3);
+ vst1q_f64 (VECT_VAR (result, float, 64, 2),
+ VECT_VAR (vector_res, float, 64, 2));
-+ CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma3_static, "");
++ CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfma3_static, "");
+
+#undef TEST_MSG
+#define TEST_MSG "VFMS_VFMA_N (FP64)"
@@ -77369,13 +97555,13 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ VECT_VAR (vsrc_2, float, 64, 1), delem0);
+ vst1_f64 (VECT_VAR (result, float, 64, 1),
+ VECT_VAR (vector_res, float, 64, 1));
-+ CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms0_static, "");
++ CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfms0_static, "");
+ VECT_VAR (vector_res, float, 64, 1) =
+ vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
+ VECT_VAR (vsrc_2, float, 64, 1), delem0);
+ vst1_f64 (VECT_VAR (result, float, 64, 1),
+ VECT_VAR (vector_res, float, 64, 1));
-+ CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma0_static, "");
++ CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfma0_static, "");
+
+ VECT_VAR_DECL (buf_src_3, float, 64, 1) [] = {DA2};
+ VECT_VAR_DECL (buf_src_4, float, 64, 1) [] = {DB2};
@@ -77386,13 +97572,13 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ VECT_VAR (vsrc_2, float, 64, 1), delem1);
+ vst1_f64 (VECT_VAR (result, float, 64, 1),
+ VECT_VAR (vector_res, float, 64, 1));
-+ CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms1_static, "");
++ CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfms1_static, "");
+ VECT_VAR (vector_res, float, 64, 1) =
+ vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
+ VECT_VAR (vsrc_2, float, 64, 1), delem1);
+ vst1_f64 (VECT_VAR (result, float, 64, 1),
+ VECT_VAR (vector_res, float, 64, 1));
-+ CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma1_static, "");
++ CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfma1_static, "");
+
+ VECT_VAR_DECL (buf_src_5, float, 64, 1) [] = {DA4};
+ VECT_VAR_DECL (buf_src_6, float, 64, 1) [] = {DB4};
@@ -77403,13 +97589,13 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ VECT_VAR (vsrc_2, float, 64, 1), delem2);
+ vst1_f64 (VECT_VAR (result, float, 64, 1),
+ VECT_VAR (vector_res, float, 64, 1));
-+ CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms2_static, "");
++ CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfms2_static, "");
+ VECT_VAR (vector_res, float, 64, 1) =
+ vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
+ VECT_VAR (vsrc_2, float, 64, 1), delem2);
+ vst1_f64 (VECT_VAR (result, float, 64, 1),
+ VECT_VAR (vector_res, float, 64, 1));
-+ CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma2_static, "");
++ CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfma2_static, "");
+
+ VECT_VAR_DECL (buf_src_7, float, 64, 1) [] = {DA6};
+ VECT_VAR_DECL (buf_src_8, float, 64, 1) [] = {DB6};
@@ -77420,13 +97606,13 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ VECT_VAR (vsrc_2, float, 64, 1), delem3);
+ vst1_f64 (VECT_VAR (result, float, 64, 1),
+ VECT_VAR (vector_res, float, 64, 1));
-+ CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms3_static, "");
++ CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfms3_static, "");
+ VECT_VAR (vector_res, float, 64, 1) =
+ vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
+ VECT_VAR (vsrc_2, float, 64, 1), delem3);
+ vst1_f64 (VECT_VAR (result, float, 64, 1),
+ VECT_VAR (vector_res, float, 64, 1));
-+ CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma3_static, "");
++ CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfma3_static, "");
+}
+#endif
+
@@ -77481,6 +97667,19 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+
+/* Include the template for ternary scalar operations. */
+#include "ternary_scalar_op.inc"
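
The vfms/vfma_n hunks above make the same correction for FP64 lanes: the harness logs the raw bit pattern of each lane, a double needs all 64 bits, and PRIx16 hid everything but the low half. A standalone sketch of the difference (illustrative, not testsuite code):

#include <inttypes.h>
#include <stdio.h>
#include <string.h>

int main (void)
{
  double d = 1.5;                               /* bit pattern 0x3ff8000000000000 */
  uint64_t bits;
  memcpy (&bits, &d, sizeof bits);              /* raw view of the lane */
  printf ("%" PRIx64 "\n", bits);               /* full 64-bit pattern */
  printf ("%" PRIx16 "\n", (uint16_t) bits);    /* low 16 bits only: prints 0 */
  return 0;
}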
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c
+@@ -63,8 +63,8 @@ void exec_vget_high (void)
+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
+ CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, "");
+ CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
+- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
++ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected, "");
+ CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, "");
+ }
+
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_lane.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_lane.c
@@ -13,6 +13,7 @@ uint32_t expected_u32 = 0xfffffff1;
@@ -77550,6 +97749,19 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
TEST_VGET_LANE_FP(q, float, f, 32, 4, 3);
}
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_low.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_low.c
+@@ -63,8 +63,8 @@ void exec_vget_low (void)
+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
+ CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, "");
+ CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
+- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
++ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected, "");
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
+ #endif
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2_lane_f16_indices_1.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2_lane_f16_indices_1.c
@@ -2,6 +2,7 @@
@@ -77610,6 +97822,63 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
float16x8x4_t
f_vld4q_lane_f16 (float16_t * p, float16x8x4_t v)
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c
+@@ -528,8 +528,8 @@ void exec_vldX (void)
+ CHECK(test_name, uint, 16, 4, PRIx16, EXPECTED, comment); \
+ CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment); \
+ CHECK(test_name, uint, 64, 1, PRIx64, EXPECTED, comment); \
+- CHECK(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
+- CHECK(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
++ CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
++ CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
+ CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment); \
+ \
+ CHECK(test_name, int, 8, 16, PRIx8, EXPECTED, comment); \
+@@ -538,8 +538,8 @@ void exec_vldX (void)
+ CHECK(test_name, uint, 8, 16, PRIx8, EXPECTED, comment); \
+ CHECK(test_name, uint, 16, 8, PRIx16, EXPECTED, comment); \
+ CHECK(test_name, uint, 32, 4, PRIx32, EXPECTED, comment); \
+- CHECK(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \
+- CHECK(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \
++ CHECK_POLY(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \
++ CHECK_POLY(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \
+ CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment)
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c
+@@ -270,8 +270,8 @@ void exec_vldX_dup (void)
+ CHECK(test_name, uint, 16, 4, PRIx16, EXPECTED, comment); \
+ CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment); \
+ CHECK(test_name, uint, 64, 1, PRIx64, EXPECTED, comment); \
+- CHECK(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
+- CHECK(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
++ CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
++ CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
+ CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment)
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c
+@@ -451,14 +451,14 @@ void exec_vldX_lane (void)
+ CHECK(test_name, uint, 8, 8, PRIx8, EXPECTED, comment); \
+ CHECK(test_name, uint, 16, 4, PRIx16, EXPECTED, comment); \
+ CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment); \
+- CHECK(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
+- CHECK(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
++ CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
++ CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
+ CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment); \
+ CHECK(test_name, int, 16, 8, PRIx16, EXPECTED, comment); \
+ CHECK(test_name, int, 32, 4, PRIx32, EXPECTED, comment); \
+ CHECK(test_name, uint, 16, 8, PRIx16, EXPECTED, comment); \
+ CHECK(test_name, uint, 32, 4, PRIx32, EXPECTED, comment); \
+- CHECK(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \
++ CHECK_POLY(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \
+ CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment)
+
+ #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmax.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmax.c
@@ -7,6 +7,10 @@
@@ -78539,6 +98808,24 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ exec_vminv_f16 ();
+ return 0;
+}
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmovn.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmovn.c
+@@ -35,11 +35,11 @@ void exec_vmovn (void)
+ TEST_VMOVN(uint, u, 32, 16, 4);
+ TEST_VMOVN(uint, u, 64, 32, 2);
+
+- CHECK(TEST_MSG, int, 8, 8, PRIx32, expected, "");
+- CHECK(TEST_MSG, int, 16, 4, PRIx64, expected, "");
++ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected, "");
++ CHECK(TEST_MSG, int, 16, 4, PRIx16, expected, "");
+ CHECK(TEST_MSG, int, 32, 2, PRIx32, expected, "");
+- CHECK(TEST_MSG, uint, 8, 8, PRIx32, expected, "");
+- CHECK(TEST_MSG, uint, 16, 4, PRIx64, expected, "");
++ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected, "");
++ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
+ CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, "");
+ }
+
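
The vmovn fix above follows from the semantics of the instruction itself: a narrowing move keeps only the low half of each lane, so results must be printed at the narrowed width (PRIx8 for 16-to-8, PRIx16 for 32-to-16), not wider. Scalar sketch, not from the testsuite:

#include <inttypes.h>
#include <stdio.h>

int main (void)
{
  uint16_t wide[4] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
  for (int i = 0; i < 4; i++)
    printf ("%" PRIx8 " ", (uint8_t) wide[i]);  /* low half only: f0 f1 f2 f3 */
  printf ("\n");
  return 0;
}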
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul.c
@@ -13,6 +13,10 @@ VECT_VAR_DECL(expected,uint,16,4) [] = { 0xfab0, 0xfb05, 0xfb5a, 0xfbaf };
@@ -78621,9 +98908,21 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
CHECK(TEST_MSG, int, 8, 8, PRIx8, expected, "");
CHECK(TEST_MSG, int, 16, 4, PRIx16, expected, "");
-@@ -152,6 +181,10 @@ void FNNAME (INSN_NAME) (void)
+@@ -142,7 +171,7 @@ void FNNAME (INSN_NAME) (void)
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected, "");
+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
+ CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
+ CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, "");
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected, "");
+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, "");
+@@ -150,8 +179,12 @@ void FNNAME (INSN_NAME) (void)
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected, "");
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
+- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected, "");
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
@@ -78721,17 +99020,21 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+#endif
TEST_VMUL_LANE(q, float, f, 32, 4, 2, 0);
- CHECK(TEST_MSG, int, 16, 4, PRIx64, expected, "");
+- CHECK(TEST_MSG, int, 16, 4, PRIx64, expected, "");
++ CHECK(TEST_MSG, int, 16, 4, PRIx16, expected, "");
CHECK(TEST_MSG, int, 32, 2, PRIx32, expected, "");
- CHECK(TEST_MSG, uint, 16, 4, PRIx64, expected, "");
+- CHECK(TEST_MSG, uint, 16, 4, PRIx64, expected, "");
++ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, "");
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
+#endif
CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, "");
- CHECK(TEST_MSG, int, 16, 8, PRIx64, expected, "");
+- CHECK(TEST_MSG, int, 16, 8, PRIx64, expected, "");
++ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, "");
CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, "");
- CHECK(TEST_MSG, uint, 16, 8, PRIx64, expected, "");
+- CHECK(TEST_MSG, uint, 16, 8, PRIx64, expected, "");
++ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, "");
@@ -79268,17 +99571,21 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+#endif
TEST_VMUL_N(q, float, f, 32, 4, 88.9f);
- CHECK(TEST_MSG, int, 16, 4, PRIx64, expected, "");
+- CHECK(TEST_MSG, int, 16, 4, PRIx64, expected, "");
++ CHECK(TEST_MSG, int, 16, 4, PRIx16, expected, "");
CHECK(TEST_MSG, int, 32, 2, PRIx32, expected, "");
- CHECK(TEST_MSG, uint, 16, 4, PRIx64, expected, "");
+- CHECK(TEST_MSG, uint, 16, 4, PRIx64, expected, "");
++ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, "");
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
+#endif
CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, "");
- CHECK(TEST_MSG, int, 16, 8, PRIx64, expected, "");
+- CHECK(TEST_MSG, int, 16, 8, PRIx64, expected, "");
++ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, "");
CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, "");
- CHECK(TEST_MSG, uint, 16, 8, PRIx64, expected, "");
+- CHECK(TEST_MSG, uint, 16, 8, PRIx64, expected, "");
++ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, "");
@@ -79424,6 +99731,41 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ exec_vmulh_lane_f16 ();
+ return 0;
+}
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull.c
+@@ -59,13 +59,13 @@ void exec_vmull (void)
+ TEST_VMULL(uint, u, 32, 64, 2);
+ TEST_VMULL(poly, p, 8, 16, 8);
+
+- CHECK(TEST_MSG, int, 16, 8, PRIx64, expected, "");
++ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, "");
+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, "");
+- CHECK(TEST_MSG, int, 64, 2, PRIx32, expected, "");
+- CHECK(TEST_MSG, uint, 16, 8, PRIx64, expected, "");
++ CHECK(TEST_MSG, int, 64, 2, PRIx64, expected, "");
++ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
+ CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
+- CHECK(TEST_MSG, uint, 64, 2, PRIx32, expected, "");
+- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected, "");
++ CHECK(TEST_MSG, uint, 64, 2, PRIx64, expected, "");
++ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected, "");
+ }
+
+ int main (void)
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_lane.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_lane.c
+@@ -54,9 +54,9 @@ void exec_vmull_lane (void)
+ TEST_VMULL_LANE(uint, u, 32, 64, 2, 1);
+
+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, "");
+- CHECK(TEST_MSG, int, 64, 2, PRIx32, expected, "");
++ CHECK(TEST_MSG, int, 64, 2, PRIx64, expected, "");
+ CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
+- CHECK(TEST_MSG, uint, 64, 2, PRIx32, expected, "");
++ CHECK(TEST_MSG, uint, 64, 2, PRIx64, expected, "");
+ }
+
+ int main (void)
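
Alongside the width fixes, the vmull hunks route polynomial-type checks through a dedicated CHECK_POLY macro. Its definition is not shown in this patch; the visible point is that poly lanes get their own entry point, which plausibly lets the harness specialize or gate their handling in one place instead of at every call site. A purely illustrative sketch (this CHECK_POLY is a stand-in, not the arm-neon-ref.h definition):

#include <stdio.h>

/* Generic check used for int/uint lanes. */
#define CHECK(msg, val, exp) \
  printf ("%s: %s\n", (msg), (val) == (exp) ? "ok" : "FAIL")

/* Dedicated entry point for polynomial lanes: it merely forwards here,
   but a single name lets casts or availability gates be added later
   without touching every test body. */
#define CHECK_POLY(msg, val, exp) CHECK (msg, val, exp)

int main (void)
{
  CHECK ("uint lane", 0x12, 0x12);
  CHECK_POLY ("poly lane", 0x34, 0x34);
  return 0;
}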
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmulx_f16_1.c
@@ -0,0 +1,84 @@
@@ -80293,6 +100635,25 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ exec_vmulxh_lane_f16 ();
+ return 0;
+}
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmvn.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmvn.c
+@@ -120,14 +120,14 @@ FNNAME (INSN_NAME)
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected, "");
+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
+ CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected, "");
+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, "");
+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, "");
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected, "");
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
+ CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
+- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
+ }
+
+ int main (void)
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vneg.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vneg.c
@@ -21,24 +21,53 @@ VECT_VAR_DECL(expected,int,32,4) [] = { 0x10, 0xf, 0xe, 0xd };
@@ -80423,7 +100784,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
VLOAD(vector, buffer, , float, f, 32, 2);
/* Apply a binary operator named INSN_NAME. */
-@@ -49,6 +58,9 @@ void FNNAME (INSN_NAME) (void)
+@@ -49,14 +58,20 @@ void FNNAME (INSN_NAME) (void)
TEST_VPXXX(INSN_NAME, uint, u, 8, 8);
TEST_VPXXX(INSN_NAME, uint, u, 16, 4);
TEST_VPXXX(INSN_NAME, uint, u, 32, 2);
@@ -80432,10 +100793,15 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+#endif
TEST_VPXXX(INSN_NAME, float, f, 32, 2);
- CHECK(TEST_MSG, int, 8, 8, PRIx32, expected, "");
-@@ -57,6 +69,9 @@ void FNNAME (INSN_NAME) (void)
- CHECK(TEST_MSG, uint, 8, 8, PRIx32, expected, "");
- CHECK(TEST_MSG, uint, 16, 4, PRIx64, expected, "");
+- CHECK(TEST_MSG, int, 8, 8, PRIx32, expected, "");
+- CHECK(TEST_MSG, int, 16, 4, PRIx64, expected, "");
++ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected, "");
++ CHECK(TEST_MSG, int, 16, 4, PRIx16, expected, "");
+ CHECK(TEST_MSG, int, 32, 2, PRIx32, expected, "");
+- CHECK(TEST_MSG, uint, 8, 8, PRIx32, expected, "");
+- CHECK(TEST_MSG, uint, 16, 4, PRIx64, expected, "");
++ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected, "");
++ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, "");
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
@@ -80596,6 +100962,192 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ exec_vpminmaxnm_f16 ();
+ return 0;
+}
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqabs.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqabs.c
+@@ -90,9 +90,9 @@ void vqabs_extra()
+ TEST_UNARY_SAT_OP(INSN_NAME, q, int, s, 32, 4, expected_cumulative_sat_min_neg, MSG);
+
+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_min_neg, MSG);
+- CHECK(TEST_MSG, int, 16, 4, PRIx8, expected_min_neg, MSG);
+- CHECK(TEST_MSG, int, 32, 2, PRIx8, expected_min_neg, MSG);
++ CHECK(TEST_MSG, int, 16, 4, PRIx16, expected_min_neg, MSG);
++ CHECK(TEST_MSG, int, 32, 2, PRIx32, expected_min_neg, MSG);
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_min_neg, MSG);
+- CHECK(TEST_MSG, int, 16, 8, PRIx8, expected_min_neg, MSG);
+- CHECK(TEST_MSG, int, 32, 4, PRIx8, expected_min_neg, MSG);
++ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_min_neg, MSG);
++ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_min_neg, MSG);
+ }
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqdmull.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqdmull.c
+@@ -63,8 +63,8 @@ void FNNAME (INSN_NAME) (void)
+ TEST_VQDMULL(int, s, 16, 32, 4, expected_cumulative_sat, "");
+ TEST_VQDMULL(int, s, 32, 64, 2, expected_cumulative_sat, "");
+
+- CHECK (TEST_MSG, int, 32, 4, PRIx16, expected, "");
+- CHECK (TEST_MSG, int, 64, 2, PRIx32, expected, "");
++ CHECK (TEST_MSG, int, 32, 4, PRIx32, expected, "");
++ CHECK (TEST_MSG, int, 64, 2, PRIx64, expected, "");
+
+ VDUP(vector, , int, s, 16, 4, 0x8000);
+ VDUP(vector2, , int, s, 16, 4, 0x8000);
+@@ -75,8 +75,8 @@ void FNNAME (INSN_NAME) (void)
+ TEST_VQDMULL(int, s, 16, 32, 4, expected_cumulative_sat2, TEST_MSG2);
+ TEST_VQDMULL(int, s, 32, 64, 2, expected_cumulative_sat2, TEST_MSG2);
+
+- CHECK (TEST_MSG, int, 32, 4, PRIx16, expected2, TEST_MSG2);
+- CHECK (TEST_MSG, int, 64, 2, PRIx32, expected2, TEST_MSG2);
++ CHECK (TEST_MSG, int, 32, 4, PRIx32, expected2, TEST_MSG2);
++ CHECK (TEST_MSG, int, 64, 2, PRIx64, expected2, TEST_MSG2);
+ }
+
+ int main (void)
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqneg.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqneg.c
+@@ -90,9 +90,9 @@ void vqneg_extra()
+ TEST_UNARY_SAT_OP(INSN_NAME, q, int, s, 32, 4, expected_cumulative_sat_min_neg, MSG);
+
+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_min_neg, MSG);
+- CHECK(TEST_MSG, int, 16, 4, PRIx8, expected_min_neg, MSG);
+- CHECK(TEST_MSG, int, 32, 2, PRIx8, expected_min_neg, MSG);
++ CHECK(TEST_MSG, int, 16, 4, PRIx16, expected_min_neg, MSG);
++ CHECK(TEST_MSG, int, 32, 2, PRIx32, expected_min_neg, MSG);
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_min_neg, MSG);
+- CHECK(TEST_MSG, int, 16, 8, PRIx8, expected_min_neg, MSG);
+- CHECK(TEST_MSG, int, 32, 4, PRIx8, expected_min_neg, MSG);
++ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_min_neg, MSG);
++ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_min_neg, MSG);
+ }
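
The vqabs/vqneg hunks above exercise the saturation corner their expected_min_neg arrays name: negating the most negative value cannot be represented, so the result saturates and the cumulative-saturation flag is set. Scalar sketch of the 8-bit case (illustrative only):

#include <limits.h>
#include <stdio.h>

static signed char qneg8 (signed char a, int *sat)
{
  if (a == SCHAR_MIN)            /* -(-128) does not fit in 8 bits */
    {
      *sat = 1;
      return SCHAR_MAX;          /* saturate instead of overflowing */
    }
  return (signed char) -a;
}

int main (void)
{
  int sat = 0;
  printf ("%d (sat=%d)\n", qneg8 (SCHAR_MIN, &sat), sat);  /* 127 (sat=1) */
  return 0;
}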
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqtbX.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqtbX.c
+@@ -318,13 +318,13 @@ void exec_vqtbX (void)
+
+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbl1, "");
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbl1, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl1, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl1, "");
+
+ #undef TEST_MSG
+ #define TEST_MSG "VQTBL1Q"
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbl1q, "");
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbl1q, "");
+- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl1q, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl1q, "");
+
+ /* Check vqtbl2. */
+ clean_results ();
+@@ -334,13 +334,13 @@ void exec_vqtbX (void)
+
+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbl2, "");
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbl2, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl2, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl2, "");
+
+ #undef TEST_MSG
+ #define TEST_MSG "VQTBL2Q"
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbl2q, "");
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbl2q, "");
+- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl2q, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl2q, "");
+
+ /* Check vqtbl3. */
+ clean_results ();
+@@ -350,13 +350,13 @@ void exec_vqtbX (void)
+
+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbl3, "");
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbl3, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl3, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl3, "");
+
+ #undef TEST_MSG
+ #define TEST_MSG "VQTBL3Q"
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbl3q, "");
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbl3q, "");
+- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl3q, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl3q, "");
+
+ /* Check vqtbl4. */
+ clean_results ();
+@@ -366,13 +366,13 @@ void exec_vqtbX (void)
+
+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbl4, "");
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbl4, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl4, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl4, "");
+
+ #undef TEST_MSG
+ #define TEST_MSG "VQTBL4Q"
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbl4q, "");
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbl4q, "");
+- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl4q, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl4q, "");
+
+
+ /* Now test VQTBX. */
+@@ -455,13 +455,13 @@ void exec_vqtbX (void)
+
+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbx1, "");
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbx1, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx1, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx1, "");
+
+ #undef TEST_MSG
+ #define TEST_MSG "VQTBX1Q"
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbx1q, "");
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbx1q, "");
+- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx1q, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx1q, "");
+
+ /* Check vqtbx2. */
+ clean_results ();
+@@ -471,13 +471,13 @@ void exec_vqtbX (void)
+
+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbx2, "");
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbx2, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx2, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx2, "");
+
+ #undef TEST_MSG
+ #define TEST_MSG "VQTBX2Q"
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbx2q, "");
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbx2q, "");
+- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx2q, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx2q, "");
+
+ /* Check vqtbx3. */
+ clean_results ();
+@@ -487,13 +487,13 @@ void exec_vqtbX (void)
+
+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbx3, "");
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbx3, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx3, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx3, "");
+
+ #undef TEST_MSG
+ #define TEST_MSG "VQTBX3Q"
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbx3q, "");
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbx3q, "");
+- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx3q, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx3q, "");
+
+ /* Check vqtbx4. */
+ clean_results ();
+@@ -503,13 +503,13 @@ void exec_vqtbX (void)
+
+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbx4, "");
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbx4, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx4, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx4, "");
+
+ #undef TEST_MSG
+ #define TEST_MSG "VQTBX4Q"
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbx4q, "");
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbx4q, "");
+- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx4q, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx4q, "");
+ }
+
+ int main (void)
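
The vqtbX changes also show the testsuite's labelling idiom: TEST_MSG is redefined before each sub-test so every CHECK/CHECK_POLY call site reports the variant currently running. Because the message macro is expanded at each use, no check needs an explicit name argument. Reduced sketch:

#include <stdio.h>

#define CHECK_DEMO(val, exp) \
  printf ("%s: %s\n", TEST_MSG, (val) == (exp) ? "ok" : "FAIL")

#define TEST_MSG "VQTBL1"
int main (void)
{
  CHECK_DEMO (1, 1);             /* reported as VQTBL1 */
#undef TEST_MSG
#define TEST_MSG "VQTBL1Q"
  CHECK_DEMO (2, 2);             /* reported as VQTBL1Q */
  return 0;
}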
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrecpe.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrecpe.c
@@ -7,6 +7,14 @@
@@ -81919,7 +102471,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ vreinterpret##Q##_##T2##W##_##TS2##WS(VECT_VAR(vreint_vector, TS1, WS, NS))); \
+ vst1##Q##_##T2##64(VECT_VAR(result, poly, 64, 2), \
+ VECT_VAR(vreint_vector_res, poly, 64, 2)); \
-+ CHECK(TEST_MSG, T1, 64, 2, PRIx##64, EXPECTED, "");
++ CHECK_POLY(TEST_MSG, T1, 64, 2, PRIx##64, EXPECTED, "");
+
+ TEST_VREINTERPRET128(q, poly, p, 128, 1, int, s, 8, 16, vreint_expected_q_p128_s8);
+ TEST_VREINTERPRET128(q, poly, p, 128, 1, int, s, 16, 8, vreint_expected_q_p128_s16);
@@ -81980,7 +102532,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+}
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret_p64.c
-@@ -0,0 +1,209 @@
+@@ -0,0 +1,216 @@
+/* This file contains tests for the vreinterpret *p64 intrinsics. */
+
+/* { dg-require-effective-target arm_crypto_ok { target { arm*-*-* } } } */
@@ -82097,6 +102649,13 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ VECT_VAR(vreint_vector_res, T1, W, N)); \
+ CHECK(TEST_MSG, T1, W, N, PRIx##W, EXPECTED, "");
+
++#define TEST_VREINTERPRET_TO_POLY(Q, T1, T2, W, N, TS1, TS2, WS, NS, EXPECTED) \
++ VECT_VAR(vreint_vector_res, T1, W, N) = \
++ vreinterpret##Q##_##T2##W##_##TS2##WS(VECT_VAR(vreint_vector, TS1, WS, NS)); \
++ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), \
++ VECT_VAR(vreint_vector_res, T1, W, N)); \
++ CHECK_POLY(TEST_MSG, T1, W, N, PRIx##W, EXPECTED, "");
++
+#define TEST_VREINTERPRET_FP(Q, T1, T2, W, N, TS1, TS2, WS, NS, EXPECTED) \
+ VECT_VAR(vreint_vector_res, T1, W, N) = \
+ vreinterpret##Q##_##T2##W##_##TS2##WS(VECT_VAR(vreint_vector, TS1, WS, NS)); \
@@ -82122,38 +102681,38 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ /* vreinterpret_p64_* tests. */
+#undef TEST_MSG
+#define TEST_MSG "VREINTERPRET_P64_*"
-+ TEST_VREINTERPRET(, poly, p, 64, 1, int, s, 8, 8, vreint_expected_p64_s8);
-+ TEST_VREINTERPRET(, poly, p, 64, 1, int, s, 16, 4, vreint_expected_p64_s16);
-+ TEST_VREINTERPRET(, poly, p, 64, 1, int, s, 32, 2, vreint_expected_p64_s32);
-+ TEST_VREINTERPRET(, poly, p, 64, 1, int, s, 64, 1, vreint_expected_p64_s64);
-+ TEST_VREINTERPRET(, poly, p, 64, 1, uint, u, 8, 8, vreint_expected_p64_u8);
-+ TEST_VREINTERPRET(, poly, p, 64, 1, uint, u, 16, 4, vreint_expected_p64_u16);
-+ TEST_VREINTERPRET(, poly, p, 64, 1, uint, u, 32, 2, vreint_expected_p64_u32);
-+ TEST_VREINTERPRET(, poly, p, 64, 1, uint, u, 64, 1, vreint_expected_p64_u64);
-+ TEST_VREINTERPRET(, poly, p, 64, 1, poly, p, 8, 8, vreint_expected_p64_p8);
-+ TEST_VREINTERPRET(, poly, p, 64, 1, poly, p, 16, 4, vreint_expected_p64_p16);
++ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, int, s, 8, 8, vreint_expected_p64_s8);
++ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, int, s, 16, 4, vreint_expected_p64_s16);
++ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, int, s, 32, 2, vreint_expected_p64_s32);
++ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, int, s, 64, 1, vreint_expected_p64_s64);
++ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, uint, u, 8, 8, vreint_expected_p64_u8);
++ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, uint, u, 16, 4, vreint_expected_p64_u16);
++ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, uint, u, 32, 2, vreint_expected_p64_u32);
++ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, uint, u, 64, 1, vreint_expected_p64_u64);
++ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, poly, p, 8, 8, vreint_expected_p64_p8);
++ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, poly, p, 16, 4, vreint_expected_p64_p16);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET(, poly, p, 64, 1, float, f, 16, 4, vreint_expected_p64_f16);
++ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, float, f, 16, 4, vreint_expected_p64_f16);
+#endif
-+ TEST_VREINTERPRET(, poly, p, 64, 1, float, f, 32, 2, vreint_expected_p64_f32);
++ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, float, f, 32, 2, vreint_expected_p64_f32);
+
+ /* vreinterpretq_p64_* tests. */
+#undef TEST_MSG
+#define TEST_MSG "VREINTERPRETQ_P64_*"
-+ TEST_VREINTERPRET(q, poly, p, 64, 2, int, s, 8, 16, vreint_expected_q_p64_s8);
-+ TEST_VREINTERPRET(q, poly, p, 64, 2, int, s, 16, 8, vreint_expected_q_p64_s16);
-+ TEST_VREINTERPRET(q, poly, p, 64, 2, int, s, 32, 4, vreint_expected_q_p64_s32);
-+ TEST_VREINTERPRET(q, poly, p, 64, 2, int, s, 64, 2, vreint_expected_q_p64_s64);
-+ TEST_VREINTERPRET(q, poly, p, 64, 2, uint, u, 8, 16, vreint_expected_q_p64_u8);
-+ TEST_VREINTERPRET(q, poly, p, 64, 2, uint, u, 16, 8, vreint_expected_q_p64_u16);
-+ TEST_VREINTERPRET(q, poly, p, 64, 2, uint, u, 32, 4, vreint_expected_q_p64_u32);
-+ TEST_VREINTERPRET(q, poly, p, 64, 2, uint, u, 64, 2, vreint_expected_q_p64_u64);
-+ TEST_VREINTERPRET(q, poly, p, 64, 2, poly, p, 8, 16, vreint_expected_q_p64_p8);
-+ TEST_VREINTERPRET(q, poly, p, 64, 2, poly, p, 16, 8, vreint_expected_q_p64_p16);
++ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, int, s, 8, 16, vreint_expected_q_p64_s8);
++ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, int, s, 16, 8, vreint_expected_q_p64_s16);
++ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, int, s, 32, 4, vreint_expected_q_p64_s32);
++ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, int, s, 64, 2, vreint_expected_q_p64_s64);
++ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, uint, u, 8, 16, vreint_expected_q_p64_u8);
++ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, uint, u, 16, 8, vreint_expected_q_p64_u16);
++ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, uint, u, 32, 4, vreint_expected_q_p64_u32);
++ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, uint, u, 64, 2, vreint_expected_q_p64_u64);
++ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, poly, p, 8, 16, vreint_expected_q_p64_p8);
++ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, poly, p, 16, 8, vreint_expected_q_p64_p16);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET(q, poly, p, 64, 2, float, f, 16, 8, vreint_expected_q_p64_f16);
++ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, float, f, 16, 8, vreint_expected_q_p64_f16);
+#endif
-+ TEST_VREINTERPRET(q, poly, p, 64, 2, float, f, 32, 4, vreint_expected_q_p64_f32);
++ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, float, f, 32, 4, vreint_expected_q_p64_f32);
+
+ /* vreinterpret_*_p64 tests. */
+#undef TEST_MSG
@@ -82167,8 +102726,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ TEST_VREINTERPRET(, uint, u, 16, 4, poly, p, 64, 1, vreint_expected_u16_p64);
+ TEST_VREINTERPRET(, uint, u, 32, 2, poly, p, 64, 1, vreint_expected_u32_p64);
+ TEST_VREINTERPRET(, uint, u, 64, 1, poly, p, 64, 1, vreint_expected_u64_p64);
-+ TEST_VREINTERPRET(, poly, p, 8, 8, poly, p, 64, 1, vreint_expected_p8_p64);
-+ TEST_VREINTERPRET(, poly, p, 16, 4, poly, p, 64, 1, vreint_expected_p16_p64);
++ TEST_VREINTERPRET_TO_POLY(, poly, p, 8, 8, poly, p, 64, 1, vreint_expected_p8_p64);
++ TEST_VREINTERPRET_TO_POLY(, poly, p, 16, 4, poly, p, 64, 1, vreint_expected_p16_p64);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ TEST_VREINTERPRET_FP(, float, f, 16, 4, poly, p, 64, 1, vreint_expected_f16_p64);
+#endif
@@ -82181,8 +102740,8 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ TEST_VREINTERPRET(q, uint, u, 16, 8, poly, p, 64, 2, vreint_expected_q_u16_p64);
+ TEST_VREINTERPRET(q, uint, u, 32, 4, poly, p, 64, 2, vreint_expected_q_u32_p64);
+ TEST_VREINTERPRET(q, uint, u, 64, 2, poly, p, 64, 2, vreint_expected_q_u64_p64);
-+ TEST_VREINTERPRET(q, poly, p, 8, 16, poly, p, 64, 2, vreint_expected_q_p8_p64);
-+ TEST_VREINTERPRET(q, poly, p, 16, 8, poly, p, 64, 2, vreint_expected_q_p16_p64);
++ TEST_VREINTERPRET_TO_POLY(q, poly, p, 8, 16, poly, p, 64, 2, vreint_expected_q_p8_p64);
++ TEST_VREINTERPRET_TO_POLY(q, poly, p, 16, 8, poly, p, 64, 2, vreint_expected_q_p16_p64);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ TEST_VREINTERPRET_FP(q, float, f, 16, 8, poly, p, 64, 2, vreint_expected_q_f16_p64);
+#endif
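
The vreinterpret_p64 test added above only shuffles views, never values: vreinterpret is a bit-for-bit reinterpretation of the same register as another element type. Plain-C equivalent of the idea, with uint64_t standing in for poly64_t:

#include <inttypes.h>
#include <stdio.h>
#include <string.h>

int main (void)
{
  uint8_t u8[8] = { 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7 };
  uint64_t p64;
  memcpy (&p64, u8, sizeof p64);   /* no conversion, just a new view */
  printf ("%" PRIx64 "\n", p64);   /* f7f6f5f4f3f2f1f0 on little-endian */
  return 0;
}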
@@ -82227,9 +102786,56 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
VLOAD(vector, buffer, , float, f, 32, 2);
VLOAD(vector, buffer, q, float, f, 32, 4);
-@@ -187,6 +201,12 @@ void exec_vrev (void)
- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev64, "");
- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_vrev64, "");
+@@ -118,10 +132,10 @@ void exec_vrev (void)
+
+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vrev16, "");
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vrev16, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vrev16, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vrev16, "");
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vrev16, "");
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vrev16, "");
+- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev16, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev16, "");
+
+ #undef TEST_MSG
+ #define TEST_MSG "VREV32"
+@@ -142,14 +156,14 @@ void exec_vrev (void)
+ CHECK(TEST_MSG, int, 16, 4, PRIx16, expected_vrev32, "");
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vrev32, "");
+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_vrev32, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vrev32, "");
+- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_vrev32, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vrev32, "");
++ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_vrev32, "");
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vrev32, "");
+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_vrev32, "");
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vrev32, "");
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_vrev32, "");
+- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev32, "");
+- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_vrev32, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev32, "");
++ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_vrev32, "");
+
+ #undef TEST_MSG
+ #define TEST_MSG "VREV64"
+@@ -176,17 +190,23 @@ void exec_vrev (void)
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vrev64, "");
+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_vrev64, "");
+ CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_vrev64, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vrev64, "");
+- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_vrev64, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vrev64, "");
++ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_vrev64, "");
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vrev64, "");
+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_vrev64, "");
+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_vrev64, "");
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vrev64, "");
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_vrev64, "");
+ CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_vrev64, "");
+- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev64, "");
+- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_vrev64, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev64, "");
++ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_vrev64, "");
+#if defined (FP16_SUPPORTED)
+ TEST_VREV (, float, f, 16, 4, 64);
@@ -83324,6 +103930,29 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+
+/* Include the template for binary scalar operations. */
+#include "binary_scalar_op.inc"
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsXi_n.inc
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsXi_n.inc
+@@ -76,16 +76,16 @@ void FNNAME (INSN_NAME) (void)
+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
+ CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, "");
+ CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
+- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
++ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected, "");
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected, "");
+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, "");
+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, "");
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected, "");
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
+ CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
+- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
+- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
++ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected, "");
+
+ #ifdef EXTRA_TESTS
+ EXTRA_TESTS();
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl.c
@@ -101,10 +101,8 @@ VECT_VAR_DECL(expected_negative_shift,uint,64,2) [] = { 0x7ffffffffffffff,
@@ -83391,9 +104020,25 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
#define TEST_ALL_EXTRA_CHUNKS() \
TEST_EXTRA_CHUNK(int, 8, 8, 1); \
TEST_EXTRA_CHUNK(int, 16, 4, 1); \
-@@ -143,17 +161,37 @@ void FNNAME (INSN_NAME) (void)
- CHECK(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \
- CHECK(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \
+@@ -130,8 +148,8 @@ void FNNAME (INSN_NAME) (void)
+ CHECK(test_name, uint, 8, 8, PRIx8, EXPECTED, comment); \
+ CHECK(test_name, uint, 16, 4, PRIx16, EXPECTED, comment); \
+ CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment); \
+- CHECK(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
+- CHECK(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
++ CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
++ CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
+ CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment); \
+ \
+ CHECK(test_name, int, 8, 16, PRIx8, EXPECTED, comment); \
+@@ -140,20 +158,40 @@ void FNNAME (INSN_NAME) (void)
+ CHECK(test_name, uint, 8, 16, PRIx8, EXPECTED, comment); \
+ CHECK(test_name, uint, 16, 8, PRIx16, EXPECTED, comment); \
+ CHECK(test_name, uint, 32, 4, PRIx32, EXPECTED, comment); \
+- CHECK(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \
+- CHECK(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \
++ CHECK_POLY(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \
++ CHECK_POLY(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \
CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment); \
- } \
+ }
@@ -83432,7 +104077,14 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
int main (void)
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsli_n.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsli_n.c
-@@ -166,9 +166,11 @@ void vsli_extra(void)
+@@ -161,14 +161,16 @@ void vsli_extra(void)
+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_max_shift, COMMENT);
+ CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_max_shift, COMMENT);
+ CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected_max_shift, COMMENT);
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_max_shift, COMMENT);
+- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_max_shift, COMMENT);
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_max_shift, COMMENT);
++ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_max_shift, COMMENT);
CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_max_shift, COMMENT);
CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_max_shift, COMMENT);
CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_max_shift, COMMENT);
@@ -83440,9 +104092,11 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_max_shift, COMMENT);
CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_max_shift, COMMENT);
CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_max_shift, COMMENT);
+- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_max_shift, COMMENT);
+- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_max_shift, COMMENT);
+ CHECK(TEST_MSG, uint, 64, 2, PRIx64, expected_max_shift, COMMENT);
- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_max_shift, COMMENT);
- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_max_shift, COMMENT);
++ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_max_shift, COMMENT);
++ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_max_shift, COMMENT);
}
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsqrt_f16_1.c
@@ -83562,6 +104216,27 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+
+/* Include the template for unary scalar operations. */
+#include "unary_scalar_op.inc"
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsri_n.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsri_n.c
+@@ -163,14 +163,14 @@ void vsri_extra(void)
+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_max_shift, COMMENT);
+ CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_max_shift, COMMENT);
+ CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected_max_shift, COMMENT);
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_max_shift, COMMENT);
+- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_max_shift, COMMENT);
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_max_shift, COMMENT);
++ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_max_shift, COMMENT);
+ CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_max_shift, COMMENT);
+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_max_shift, COMMENT);
+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_max_shift, COMMENT);
+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_max_shift, COMMENT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_max_shift, COMMENT);
+ CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_max_shift, COMMENT);
+- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_max_shift, COMMENT);
+- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_max_shift, COMMENT);
++ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_max_shift, COMMENT);
++ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_max_shift, COMMENT);
+ }
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2_lane_f16_indices_1.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2_lane_f16_indices_1.c
@@ -2,6 +2,7 @@
@@ -83891,9 +104566,21 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
DUMMY_ARRAY(buffer_src, float, 32, 4, 4);
/* Check vst2_lane/vst2q_lane. */
-@@ -400,6 +469,10 @@ void exec_vstX_lane (void)
+@@ -391,15 +460,19 @@ void exec_vstX_lane (void)
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st2_0, CMT);
+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st2_0, CMT);
+ CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st2_0, CMT);
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st2_0, CMT);
+- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st2_0, CMT);
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st2_0, CMT);
++ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st2_0, CMT);
+ CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st2_0, CMT);
+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st2_0, CMT);
+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st2_0, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st2_0, CMT);
CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st2_0, CMT);
- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st2_0, CMT);
+- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st2_0, CMT);
++ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st2_0, CMT);
CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st2_0, CMT);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st2_0, CMT);
@@ -83902,9 +104589,21 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
TEST_ALL_EXTRA_CHUNKS(2, 1);
#undef CMT
-@@ -419,6 +492,10 @@ void exec_vstX_lane (void)
+@@ -410,15 +483,19 @@ void exec_vstX_lane (void)
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st2_1, CMT);
+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st2_1, CMT);
+ CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st2_1, CMT);
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st2_1, CMT);
+- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st2_1, CMT);
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st2_1, CMT);
++ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st2_1, CMT);
+ CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st2_1, CMT);
+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st2_1, CMT);
+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st2_1, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st2_1, CMT);
CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st2_1, CMT);
- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st2_1, CMT);
+- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st2_1, CMT);
++ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st2_1, CMT);
CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st2_1, CMT);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st2_1, CMT);
@@ -83913,9 +104612,21 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Check vst3_lane/vst3q_lane. */
-@@ -444,6 +521,10 @@ void exec_vstX_lane (void)
+@@ -435,15 +512,19 @@ void exec_vstX_lane (void)
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st3_0, CMT);
+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st3_0, CMT);
+ CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st3_0, CMT);
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_0, CMT);
+- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_0, CMT);
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_0, CMT);
++ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_0, CMT);
+ CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st3_0, CMT);
+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st3_0, CMT);
+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st3_0, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st3_0, CMT);
CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st3_0, CMT);
- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_0, CMT);
+- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_0, CMT);
++ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_0, CMT);
CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st3_0, CMT);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st3_0, CMT);
@@ -83924,9 +104635,21 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
TEST_ALL_EXTRA_CHUNKS(3, 1);
-@@ -464,6 +545,10 @@ void exec_vstX_lane (void)
+@@ -455,15 +536,19 @@ void exec_vstX_lane (void)
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st3_1, CMT);
+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st3_1, CMT);
+ CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st3_1, CMT);
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_1, CMT);
+- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_1, CMT);
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_1, CMT);
++ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_1, CMT);
+ CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st3_1, CMT);
+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st3_1, CMT);
+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st3_1, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st3_1, CMT);
CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st3_1, CMT);
- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_1, CMT);
+- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_1, CMT);
++ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_1, CMT);
CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st3_1, CMT);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st3_1, CMT);
@@ -83935,9 +104658,21 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
TEST_ALL_EXTRA_CHUNKS(3, 2);
-@@ -484,6 +569,10 @@ void exec_vstX_lane (void)
+@@ -475,15 +560,19 @@ void exec_vstX_lane (void)
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st3_2, CMT);
+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st3_2, CMT);
+ CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st3_2, CMT);
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_2, CMT);
+- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_2, CMT);
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_2, CMT);
++ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_2, CMT);
+ CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st3_2, CMT);
+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st3_2, CMT);
+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st3_2, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st3_2, CMT);
CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st3_2, CMT);
- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_2, CMT);
+- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_2, CMT);
++ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_2, CMT);
CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st3_2, CMT);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st3_2, CMT);
@@ -83946,9 +104681,21 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Check vst4_lane/vst4q_lane. */
-@@ -509,6 +598,10 @@ void exec_vstX_lane (void)
+@@ -500,15 +589,19 @@ void exec_vstX_lane (void)
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st4_0, CMT);
+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st4_0, CMT);
+ CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st4_0, CMT);
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_0, CMT);
+- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_0, CMT);
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_0, CMT);
++ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_0, CMT);
+ CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st4_0, CMT);
+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st4_0, CMT);
+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st4_0, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st4_0, CMT);
CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st4_0, CMT);
- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_0, CMT);
+- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_0, CMT);
++ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_0, CMT);
CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st4_0, CMT);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st4_0, CMT);
@@ -83957,9 +104704,21 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
TEST_ALL_EXTRA_CHUNKS(4, 1);
-@@ -529,6 +622,10 @@ void exec_vstX_lane (void)
+@@ -520,15 +613,19 @@ void exec_vstX_lane (void)
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st4_1, CMT);
+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st4_1, CMT);
+ CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st4_1, CMT);
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_1, CMT);
+- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_1, CMT);
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_1, CMT);
++ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_1, CMT);
+ CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st4_1, CMT);
+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st4_1, CMT);
+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st4_1, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st4_1, CMT);
CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st4_1, CMT);
- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_1, CMT);
+- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_1, CMT);
++ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_1, CMT);
CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st4_1, CMT);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st4_1, CMT);
@@ -83968,9 +104727,21 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
TEST_ALL_EXTRA_CHUNKS(4, 2);
-@@ -549,6 +646,10 @@ void exec_vstX_lane (void)
+@@ -540,15 +637,19 @@ void exec_vstX_lane (void)
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st4_2, CMT);
+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st4_2, CMT);
+ CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st4_2, CMT);
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_2, CMT);
+- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_2, CMT);
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_2, CMT);
++ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_2, CMT);
+ CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st4_2, CMT);
+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st4_2, CMT);
+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st4_2, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st4_2, CMT);
CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st4_2, CMT);
- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_2, CMT);
+- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_2, CMT);
++ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_2, CMT);
CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st4_2, CMT);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st4_2, CMT);
@@ -83979,9 +104750,21 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
TEST_ALL_EXTRA_CHUNKS(4, 3);
-@@ -569,6 +670,10 @@ void exec_vstX_lane (void)
+@@ -560,15 +661,19 @@ void exec_vstX_lane (void)
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st4_3, CMT);
+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st4_3, CMT);
+ CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st4_3, CMT);
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_3, CMT);
+- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_3, CMT);
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_3, CMT);
++ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_3, CMT);
+ CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st4_3, CMT);
+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st4_3, CMT);
+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st4_3, CMT);
+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st4_3, CMT);
CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st4_3, CMT);
- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_3, CMT);
+- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_3, CMT);
++ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_3, CMT);
CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st4_3, CMT);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st4_3, CMT);
@@ -84080,6 +104863,80 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+
+/* Include the template for binary scalar operations. */
+#include "binary_scalar_op.inc"
+--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtbX.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtbX.c
+@@ -167,7 +167,7 @@ void exec_vtbX (void)
+
+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbl1, "");
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbl1, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl1, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl1, "");
+
+ /* Check vtbl2. */
+ clean_results ();
+@@ -177,7 +177,7 @@ void exec_vtbX (void)
+
+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbl2, "");
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbl2, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl2, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl2, "");
+
+ /* Check vtbl3. */
+ clean_results ();
+@@ -187,7 +187,7 @@ void exec_vtbX (void)
+
+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbl3, "");
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbl3, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl3, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl3, "");
+
+ /* Check vtbl4. */
+ clean_results ();
+@@ -197,7 +197,7 @@ void exec_vtbX (void)
+
+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbl4, "");
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbl4, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl4, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl4, "");
+
+
+ /* Now test VTBX. */
+@@ -249,7 +249,7 @@ void exec_vtbX (void)
+
+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbx1, "");
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbx1, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx1, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx1, "");
+
+ /* Check vtbx2. */
+ clean_results ();
+@@ -259,7 +259,7 @@ void exec_vtbX (void)
+
+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbx2, "");
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbx2, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx2, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx2, "");
+
+ /* Check vtbx3. */
+ clean_results ();
+@@ -269,7 +269,7 @@ void exec_vtbX (void)
+
+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbx3, "");
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbx3, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx3, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx3, "");
+
+ /* Check vtbx4. */
+ clean_results ();
+@@ -279,7 +279,7 @@ void exec_vtbX (void)
+
+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbx4, "");
+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbx4, "");
+- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx4, "");
++ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx4, "");
+ }
+
+ int main (void)
--- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtrn.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtrn.c
@@ -15,6 +15,10 @@ VECT_VAR_DECL(expected0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
@@ -85133,6 +105990,15 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* { dg-options "-O2 -mcpu=+dummy" } */
void f ()
+--- a/src/gcc/testsuite/gcc.target/aarch64/fmaxmin.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/fmaxmin.c
+@@ -1,5 +1,5 @@
+ /* { dg-do run } */
+-/* { dg-options "-O2 -ftree-vectorize -fno-inline -save-temps" } */
++/* { dg-options "-O2 -ftree-vectorize -fno-inline -fno-vect-cost-model -save-temps" } */
+
+
+ extern void abort (void);
--- a/src/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
+++ b/src/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
@@ -110,6 +110,6 @@ main (int argc, char **argv)
@@ -85169,6 +106035,15 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
-/* { dg-final { scan-assembler "fmov\\ts0, wzr" } } */
+/* { dg-final { scan-assembler "movi\\tv0\.2s, #0" } } */
+--- a/src/gcc/testsuite/gcc.target/aarch64/fmul_fcvt_2.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/fmul_fcvt_2.c
+@@ -1,5 +1,5 @@
+ /* { dg-do run } */
+-/* { dg-options "-save-temps -O2 -ftree-vectorize -fno-inline" } */
++/* { dg-options "-save-temps -O2 -ftree-vectorize -fno-inline -fno-vect-cost-model" } */
+
+ #define N 1024
+
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/ifcvt_multiple_sets_subreg_1.c
@@ -0,0 +1,30 @@
@@ -85202,6 +106077,14 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+}
+
+/* { dg-final { scan-rtl-dump "if-conversion succeeded through noce_convert_multiple_sets" "ce1" } } */
+--- a/src/gcc/testsuite/gcc.target/aarch64/ldp_stp_1.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/ldp_stp_1.c
+@@ -1,4 +1,4 @@
+-/* { dg-options "-O2" } */
++/* { dg-options "-O2 -mcpu=generic" } */
+
+ int arr[4][4];
+
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/ldp_stp_unaligned_1.c
@@ -0,0 +1,20 @@
@@ -85942,6 +106825,15 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ return 0;
+}
+
+--- a/src/gcc/testsuite/gcc.target/aarch64/store-pair-1.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/store-pair-1.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile } */
+-/* { dg-options "-O2" } */
++/* { dg-options "-O2 -mcpu=generic" } */
+
+ int f(int *a, int b)
+ {
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/struct_return.c
@@ -0,0 +1,31 @@
@@ -86194,6 +107086,39 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+ to do ldp. */
+/* { dg-final { scan-assembler-not "ldp\tw\[0-9\]+, w\[0-9\]" } } */
--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/aarch64/ubfiz_lsl_1.c
+@@ -0,0 +1,13 @@
++/* { dg-do compile } */
++/* { dg-options "-O2" } */
++
++/* Check that an X-reg UBFIZ can be simplified into a W-reg LSL. */
++
++long long
++f2 (long long x)
++{
++ return (x << 5) & 0xffffffff;
++}
++
++/* { dg-final { scan-assembler "lsl\tw" } } */
++/* { dg-final { scan-assembler-not "ubfiz\tx" } } */
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/aarch64/ubfx_lsr_1.c
+@@ -0,0 +1,14 @@
++/* { dg-do compile } */
++/* { dg-options "-O2" } */
++
++/* Check that an X-reg UBFX can be simplified into a W-reg LSR. */
++
++int
++f (unsigned long long x)
++{
++ x = (x >> 24) & 255;
++ return x + 1;
++}
++
++/* { dg-final { scan-assembler "lsr\tw" } } */
++/* { dg-final { scan-assembler-not "ubfx\tx" } } */
+--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/va_arg_1.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
@@ -86257,6 +107182,88 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+}
+
+/* { dg-final { cleanup-saved-temps } } */
+--- a/src/gcc/testsuite/gcc.target/aarch64/vect-abs-compile.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/vect-abs-compile.c
+@@ -1,6 +1,6 @@
+
+ /* { dg-do compile } */
+-/* { dg-options "-O3" } */
++/* { dg-options "-O3 -fno-vect-cost-model" } */
+
+ #define N 16
+
+--- a/src/gcc/testsuite/gcc.target/aarch64/vect-clz.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/vect-clz.c
+@@ -1,5 +1,5 @@
+ /* { dg-do run } */
+-/* { dg-options "-O3 -save-temps -fno-inline" } */
++/* { dg-options "-O3 -save-temps -fno-inline -fno-vect-cost-model" } */
+
+ extern void abort ();
+
+--- a/src/gcc/testsuite/gcc.target/aarch64/vect-fcm-eq-d.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/vect-fcm-eq-d.c
+@@ -1,5 +1,5 @@
+ /* { dg-do run } */
+-/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline" } */
++/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline -fno-vect-cost-model" } */
+
+ #define FTYPE double
+ #define ITYPE long
+--- a/src/gcc/testsuite/gcc.target/aarch64/vect-fcm-ge-d.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/vect-fcm-ge-d.c
+@@ -1,5 +1,5 @@
+ /* { dg-do run } */
+-/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline" } */
++/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline -fno-vect-cost-model" } */
+
+ #define FTYPE double
+ #define ITYPE long
+--- a/src/gcc/testsuite/gcc.target/aarch64/vect-fcm-gt-d.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/vect-fcm-gt-d.c
+@@ -1,5 +1,5 @@
+ /* { dg-do run } */
+-/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline" } */
++/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline -fno-vect-cost-model" } */
+
+ #define FTYPE double
+ #define ITYPE long
+--- a/src/gcc/testsuite/gcc.target/aarch64/vect-fmovd-zero.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/vect-fmovd-zero.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile } */
+-/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all" } */
++/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-vect-cost-model" } */
+
+ #define N 32
+
+--- a/src/gcc/testsuite/gcc.target/aarch64/vect-fmovd.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/vect-fmovd.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile } */
+-/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all" } */
++/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-vect-cost-model" } */
+
+ #define N 32
+
+--- a/src/gcc/testsuite/gcc.target/aarch64/vect-fmovf-zero.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/vect-fmovf-zero.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile } */
+-/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all" } */
++/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-vect-cost-model" } */
+
+ #define N 32
+
+--- a/src/gcc/testsuite/gcc.target/aarch64/vect-fmovf.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/vect-fmovf.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile } */
+-/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all" } */
++/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-vect-cost-model" } */
+
+ #define N 32
+
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c
@@ -0,0 +1,86 @@
@@ -86346,6 +107353,15 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+BUILD_TEST (int64x2_t, int64x2_t, q, q, s64, 1, 1)
+BUILD_TEST (uint64x2_t, uint64x2_t, q, q, u64, 1, 1)
+/* { dg-final { scan-assembler-times "ins\\tv0.d\\\[1\\\], v1.d\\\[1\\\]" 3 } } */
+--- a/src/gcc/testsuite/gcc.target/aarch64/vect_ctz_1.c
++++ b/src/gcc/testsuite/gcc.target/aarch64/vect_ctz_1.c
+@@ -1,5 +1,5 @@
+ /* { dg-do run } */
+-/* { dg-options "-O3 -save-temps -fno-inline" } */
++/* { dg-options "-O3 -save-temps -fno-inline -fno-vect-cost-model" } */
+
+ extern void abort ();
+
--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/aarch64/vget_set_lane_1.c
@@ -0,0 +1,72 @@
@@ -136655,6 +157671,25 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
+
+/* Tidy up. */
--- /dev/null
++++ b/src/gcc/testsuite/gcc.target/arm/pr79145.c
+@@ -0,0 +1,16 @@
++/* { dg-do compile } */
++/* { dg-skip-if "Test is specific to the iWMMXt" { arm*-*-* } { "-mcpu=*" } { "-mcpu=iwmmxt" } } */
++/* { dg-skip-if "Test is specific to the iWMMXt" { arm*-*-* } { "-mabi=*" } { "-mabi=iwmmxt" } } */
++/* { dg-skip-if "Test is specific to the iWMMXt" { arm*-*-* } { "-march=*" } { "-march=iwmmxt" } } */
++/* { dg-skip-if "Test is specific to ARM mode" { arm*-*-* } { "-mthumb" } { "" } } */
++/* { dg-require-effective-target arm32 } */
++/* { dg-require-effective-target arm_iwmmxt_ok } */
++/* { dg-options "-mcpu=iwmmxt" } */
++
++int
++main (void)
++{
++ volatile long long t1;
++ t1 ^= 0x55;
++ return 0;
++}
+--- /dev/null
+++ b/src/gcc/testsuite/gcc.target/arm/short-vfp-1.c
@@ -0,0 +1,45 @@
+/* { dg-do compile } */
@@ -138920,7 +159955,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
{
set_value_range_to_null (vr, type);
return;
-@@ -9919,6 +9938,40 @@ simplify_internal_call_using_ranges (gimple_stmt_iterator *gsi, gimple *stmt)
+@@ -9917,6 +9936,40 @@ simplify_internal_call_using_ranges (gimple_stmt_iterator *gsi, gimple *stmt)
return true;
}
@@ -138961,7 +159996,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* Simplify STMT using ranges if possible. */
static bool
-@@ -9929,6 +9982,68 @@ simplify_stmt_using_ranges (gimple_stmt_iterator *gsi)
+@@ -9927,6 +9980,68 @@ simplify_stmt_using_ranges (gimple_stmt_iterator *gsi)
{
enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
tree rhs1 = gimple_assign_rhs1 (stmt);
@@ -139156,20 +160191,6 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
# The floating-point conversion routines that involve a single-word integer.
# XX stands for the integer mode.
---- a/src/libgcc/config.host
-+++ b/src/libgcc/config.host
-@@ -333,6 +333,11 @@ aarch64*-*-elf | aarch64*-*-rtems*)
- tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
- tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
- ;;
-+aarch64*-*-freebsd*)
-+ extra_parts="$extra_parts crtfastmath.o"
-+ tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
-+ tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
-+ ;;
- aarch64*-*-linux*)
- extra_parts="$extra_parts crtfastmath.o"
- md_unwind_header=aarch64/linux-unwind.h
--- a/src/libgcc/config/arm/bpabi-v6m.S
+++ b/src/libgcc/config/arm/bpabi-v6m.S
@@ -1,4 +1,5 @@
@@ -139518,16 +160539,49 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
/* How to return from a function call depends on the architecture variant. */
#if (__ARM_ARCH__ > 4) || defined(__ARM_ARCH_4T__)
-@@ -305,7 +310,7 @@ LSYM(Lend_fde):
+@@ -305,35 +310,14 @@ LSYM(Lend_fde):
#ifdef __ARM_EABI__
.macro THUMB_LDIV0 name signed
-#if defined(__ARM_ARCH_6M__)
+- .ifc \signed, unsigned
+- cmp r0, #0
+- beq 1f
+- mov r0, #0
+- mvn r0, r0 @ 0xffffffff
+-1:
+- .else
+- cmp r0, #0
+- beq 2f
+- blt 3f
+#ifdef NOT_ISA_TARGET_32BIT
++
++ push {r0, lr}
+ mov r0, #0
+- mvn r0, r0
+- lsr r0, r0, #1 @ 0x7fffffff
+- b 2f
+-3: mov r0, #0x80
+- lsl r0, r0, #24 @ 0x80000000
+-2:
+- .endif
+- push {r0, r1, r2}
+- ldr r0, 4f
+- adr r1, 4f
+- add r0, r1
+- str r0, [sp, #8]
++ bl SYM(__aeabi_idiv0)
+ @ We know we are not on armv4t, so pop pc is safe.
+- pop {r0, r1, pc}
+- .align 2
+-4:
+- .word __aeabi_idiv0 - 4b
++ pop {r1, pc}
++
+ #elif defined(__thumb2__)
+ .syntax unified
.ifc \signed, unsigned
- cmp r0, #0
- beq 1f
-@@ -478,7 +483,7 @@ _L__\name:
+@@ -478,7 +462,7 @@ _L__\name:
#else /* !(__INTERWORKING_STUBS__ || __thumb2__) */
@@ -139536,7 +160590,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
#define EQUIV .thumb_set
#else
.macro ARM_FUNC_START name sp_section=
-@@ -510,7 +515,7 @@ SYM (__\name):
+@@ -510,7 +494,7 @@ SYM (__\name):
#endif
.endm
@@ -139545,7 +160599,224 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
.macro ARM_FUNC_ALIAS new old
.globl SYM (__\new)
EQUIV SYM (__\new), SYM (__\old)
-@@ -1054,7 +1059,7 @@ ARM_FUNC_START aeabi_uidivmod
+@@ -945,7 +929,170 @@ LSYM(Lover7):
+ add dividend, work
+ .endif
+ LSYM(Lgot_result):
+-.endm
++.endm
++
++/* If performance is preferred over code size, the following functions are provided. */
++#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
++
++/* Branch to div(n), and jump to label if curbit is lower than the divisor. */
++.macro BranchToDiv n, label
++ lsr curbit, dividend, \n
++ cmp curbit, divisor
++ blo \label
++.endm
++
++/* Body of div(n). Shift the divisor left by n bits and compare it
++ with the dividend. Update the dividend with the subtraction result. */
++.macro DoDiv n
++ lsr curbit, dividend, \n
++ cmp curbit, divisor
++ bcc 1f
++ lsl curbit, divisor, \n
++ sub dividend, dividend, curbit
++
++1: adc result, result
++.endm
++
++/* The body of division with positive divisor. Unless the divisor is very
++ big, shift it up in multiples of four bits, since this is the amount of
++ unwinding in the main division loop. Continue shifting until the divisor
++ is larger than the dividend. */
++.macro THUMB1_Div_Positive
++ mov result, #0
++ BranchToDiv #1, LSYM(Lthumb1_div1)
++ BranchToDiv #4, LSYM(Lthumb1_div4)
++ BranchToDiv #8, LSYM(Lthumb1_div8)
++ BranchToDiv #12, LSYM(Lthumb1_div12)
++ BranchToDiv #16, LSYM(Lthumb1_div16)
++LSYM(Lthumb1_div_large_positive):
++ mov result, #0xff
++ lsl divisor, divisor, #8
++ rev result, result
++ lsr curbit, dividend, #16
++ cmp curbit, divisor
++ blo 1f
++ asr result, #8
++ lsl divisor, divisor, #8
++ beq LSYM(Ldivbyzero_waypoint)
++
++1: lsr curbit, dividend, #12
++ cmp curbit, divisor
++ blo LSYM(Lthumb1_div12)
++ b LSYM(Lthumb1_div16)
++LSYM(Lthumb1_div_loop):
++ lsr divisor, divisor, #8
++LSYM(Lthumb1_div16):
++ Dodiv #15
++ Dodiv #14
++ Dodiv #13
++ Dodiv #12
++LSYM(Lthumb1_div12):
++ Dodiv #11
++ Dodiv #10
++ Dodiv #9
++ Dodiv #8
++ bcs LSYM(Lthumb1_div_loop)
++LSYM(Lthumb1_div8):
++ Dodiv #7
++ Dodiv #6
++ Dodiv #5
++LSYM(Lthumb1_div5):
++ Dodiv #4
++LSYM(Lthumb1_div4):
++ Dodiv #3
++LSYM(Lthumb1_div3):
++ Dodiv #2
++LSYM(Lthumb1_div2):
++ Dodiv #1
++LSYM(Lthumb1_div1):
++ sub divisor, dividend, divisor
++ bcs 1f
++ cpy divisor, dividend
++
++1: adc result, result
++ cpy dividend, result
++ RET
++
++LSYM(Ldivbyzero_waypoint):
++ b LSYM(Ldiv0)
++.endm
++
++/* The body of division with negative divisor. Similar to
++ THUMB1_Div_Positive except that the shift steps are in multiples
++ of six bits. */
++.macro THUMB1_Div_Negative
++ lsr result, divisor, #31
++ beq 1f
++ neg divisor, divisor
++
++1: asr curbit, dividend, #32
++ bcc 2f
++ neg dividend, dividend
++
++2: eor curbit, result
++ mov result, #0
++ cpy ip, curbit
++ BranchToDiv #4, LSYM(Lthumb1_div_negative4)
++ BranchToDiv #8, LSYM(Lthumb1_div_negative8)
++LSYM(Lthumb1_div_large):
++ mov result, #0xfc
++ lsl divisor, divisor, #6
++ rev result, result
++ lsr curbit, dividend, #8
++ cmp curbit, divisor
++ blo LSYM(Lthumb1_div_negative8)
++
++ lsl divisor, divisor, #6
++ asr result, result, #6
++ cmp curbit, divisor
++ blo LSYM(Lthumb1_div_negative8)
++
++ lsl divisor, divisor, #6
++ asr result, result, #6
++ cmp curbit, divisor
++ blo LSYM(Lthumb1_div_negative8)
++
++ lsl divisor, divisor, #6
++ beq LSYM(Ldivbyzero_negative)
++ asr result, result, #6
++ b LSYM(Lthumb1_div_negative8)
++LSYM(Lthumb1_div_negative_loop):
++ lsr divisor, divisor, #6
++LSYM(Lthumb1_div_negative8):
++ DoDiv #7
++ DoDiv #6
++ DoDiv #5
++ DoDiv #4
++LSYM(Lthumb1_div_negative4):
++ DoDiv #3
++ DoDiv #2
++ bcs LSYM(Lthumb1_div_negative_loop)
++ DoDiv #1
++ sub divisor, dividend, divisor
++ bcs 1f
++ cpy divisor, dividend
++
++1: cpy curbit, ip
++ adc result, result
++ asr curbit, curbit, #1
++ cpy dividend, result
++ bcc 2f
++ neg dividend, dividend
++ cmp curbit, #0
++
++2: bpl 3f
++ neg divisor, divisor
++
++3: RET
++
++LSYM(Ldivbyzero_negative):
++ cpy curbit, ip
++ asr curbit, curbit, #1
++ bcc LSYM(Ldiv0)
++ neg dividend, dividend
++.endm
++#endif /* ARM Thumb version. */
++
+ /* ------------------------------------------------------------------------ */
+ /* Start of the Real Functions */
+ /* ------------------------------------------------------------------------ */
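For orientation: the THUMB1 division macros added above implement classic
restoring shift-subtract division. A minimal C sketch of the positive path,
grounded only in the assembly shown (all names here are hypothetical; the
real routine probes the starting shift in steps of 1/4/8/12/16 bits and
accumulates quotient bits through the carry flag with adc):

    #include <stdio.h>

    /* Hypothetical C model of THUMB1_Div_Positive: one DoDiv step per
       bit, most significant first; dividend ends up as the remainder.  */
    static unsigned
    udiv_sketch (unsigned dividend, unsigned divisor)
    {
      unsigned quotient = 0;
      int n = 0;

      if (divisor == 0)
        return 0;                /* the real code branches to __aeabi_idiv0 */

      /* Find the highest n with (dividend >> n) >= divisor, the job
         BranchToDiv does in the assembly.  */
      while (n < 31 && (dividend >> (n + 1)) >= divisor)
        n++;

      for (; n >= 0; n--)
        {
          quotient <<= 1;
          if ((dividend >> n) >= divisor)   /* cmp curbit, divisor */
            {
              dividend -= divisor << n;     /* sub dividend, dividend, curbit */
              quotient |= 1;                /* adc result, result */
            }
        }
      return quotient;
    }

    int
    main (void)
    {
      printf ("%u\n", udiv_sketch (100, 7));   /* prints 14 */
      return 0;
    }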
+@@ -955,6 +1102,7 @@ LSYM(Lgot_result):
+
+ FUNC_START udivsi3
+ FUNC_ALIAS aeabi_uidiv udivsi3
++#if defined(__OPTIMIZE_SIZE__)
+
+ cmp divisor, #0
+ beq LSYM(Ldiv0)
+@@ -972,6 +1120,14 @@ LSYM(udivsi3_skip_div0_test):
+ pop { work }
+ RET
+
++/* Implementation of aeabi_uidiv for ARMv6-M. This version is only
++ used in ARMv6-M when we need an efficient implementation. */
++#else
++LSYM(udivsi3_skip_div0_test):
++ THUMB1_Div_Positive
++
++#endif /* __OPTIMIZE_SIZE__ */
++
+ #elif defined(__ARM_ARCH_EXT_IDIV__)
+
+ ARM_FUNC_START udivsi3
+@@ -1023,12 +1179,21 @@ LSYM(udivsi3_skip_div0_test):
+ FUNC_START aeabi_uidivmod
+ cmp r1, #0
+ beq LSYM(Ldiv0)
++# if defined(__OPTIMIZE_SIZE__)
+ push {r0, r1, lr}
+ bl LSYM(udivsi3_skip_div0_test)
+ POP {r1, r2, r3}
+ mul r2, r0
+ sub r1, r1, r2
+ bx r3
++# else
++ /* Both the quotient and remainder are calculated simultaneously
++ in THUMB1_Div_Positive. There is no need to calculate the
++ remainder again here. */
++ b LSYM(udivsi3_skip_div0_test)
++ RET
++# endif /* __OPTIMIZE_SIZE__ */
++
+ #elif defined(__ARM_ARCH_EXT_IDIV__)
+ ARM_FUNC_START aeabi_uidivmod
+ cmp r1, #0
+@@ -1054,7 +1219,7 @@ ARM_FUNC_START aeabi_uidivmod
/* ------------------------------------------------------------------------ */
#ifdef L_umodsi3
@@ -139554,7 +160825,92 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
ARM_FUNC_START umodsi3
-@@ -1240,7 +1245,7 @@ ARM_FUNC_START aeabi_idivmod
+@@ -1084,7 +1249,7 @@ LSYM(Lover10):
+ RET
+
+ #else /* ARM version. */
+-
++
+ FUNC_START umodsi3
+
+ subs r2, r1, #1 @ compare divisor with 1
+@@ -1109,8 +1274,9 @@ LSYM(Lover10):
+
+ #if defined(__prefer_thumb__)
+
+- FUNC_START divsi3
++ FUNC_START divsi3
+ FUNC_ALIAS aeabi_idiv divsi3
++#if defined(__OPTIMIZE_SIZE__)
+
+ cmp divisor, #0
+ beq LSYM(Ldiv0)
+@@ -1133,7 +1299,7 @@ LSYM(Lover11):
+ blo LSYM(Lgot_result)
+
+ THUMB_DIV_MOD_BODY 0
+-
++
+ mov r0, result
+ mov work, ip
+ cmp work, #0
+@@ -1143,6 +1309,22 @@ LSYM(Lover12):
+ pop { work }
+ RET
+
++/* Implementation of aeabi_idiv for ARMv6-M. This version is only
++ used in ARMv6-M when we need an efficient implementation. */
++#else
++LSYM(divsi3_skip_div0_test):
++ cpy curbit, dividend
++ orr curbit, divisor
++ bmi LSYM(Lthumb1_div_negative)
++
++LSYM(Lthumb1_div_positive):
++ THUMB1_Div_Positive
++
++LSYM(Lthumb1_div_negative):
++ THUMB1_Div_Negative
++
++#endif /* __OPTIMIZE_SIZE__ */
++
+ #elif defined(__ARM_ARCH_EXT_IDIV__)
+
+ ARM_FUNC_START divsi3
+@@ -1154,8 +1336,8 @@ LSYM(Lover12):
+ RET
+
+ #else /* ARM/Thumb-2 version. */
+-
+- ARM_FUNC_START divsi3
++
++ ARM_FUNC_START divsi3
+ ARM_FUNC_ALIAS aeabi_idiv divsi3
+
+ cmp r1, #0
+@@ -1209,12 +1391,21 @@ LSYM(divsi3_skip_div0_test):
+ FUNC_START aeabi_idivmod
+ cmp r1, #0
+ beq LSYM(Ldiv0)
++# if defined(__OPTIMIZE_SIZE__)
+ push {r0, r1, lr}
+ bl LSYM(divsi3_skip_div0_test)
+ POP {r1, r2, r3}
+ mul r2, r0
+ sub r1, r1, r2
+ bx r3
++# else
++ /* Both the quotient and remainder are calculated simultaneously
++ in THUMB1_Div_Positive and THUMB1_Div_Negative. There is no
++ need to calculate the remainder again here. */
++ b LSYM(divsi3_skip_div0_test)
++ RET
++# endif /* __OPTIMIZE_SIZE__ */
++
+ #elif defined(__ARM_ARCH_EXT_IDIV__)
+ ARM_FUNC_START aeabi_idivmod
+ cmp r1, #0
+@@ -1240,7 +1431,7 @@ ARM_FUNC_START aeabi_idivmod
/* ------------------------------------------------------------------------ */
#ifdef L_modsi3
@@ -139563,7 +160919,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
ARM_FUNC_START modsi3
-@@ -1508,14 +1513,15 @@ LSYM(Lover12):
+@@ -1508,14 +1699,15 @@ LSYM(Lover12):
#endif /* __symbian__ */
@@ -139583,7 +160939,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
FUNC_START clzsi2
mov r1, #28
mov r3, #1
-@@ -1576,7 +1582,7 @@ ARM_FUNC_START clzsi2
+@@ -1576,7 +1768,7 @@ ARM_FUNC_START clzsi2
#ifdef L_clzdi2
#if !defined(HAVE_ARM_CLZ)
@@ -139592,7 +160948,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
FUNC_START clzdi2
push {r4, lr}
# else
-@@ -1601,7 +1607,7 @@ ARM_FUNC_START clzdi2
+@@ -1601,7 +1793,7 @@ ARM_FUNC_START clzdi2
bl __clzsi2
# endif
2:
@@ -139601,7 +160957,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
pop {r4, pc}
# else
RETLDM r4
-@@ -1623,7 +1629,7 @@ ARM_FUNC_START clzdi2
+@@ -1623,7 +1815,7 @@ ARM_FUNC_START clzdi2
#endif /* L_clzdi2 */
#ifdef L_ctzsi2
@@ -139610,7 +160966,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
FUNC_START ctzsi2
neg r1, r0
and r0, r0, r1
-@@ -1738,7 +1744,7 @@ ARM_FUNC_START ctzsi2
+@@ -1738,7 +1930,7 @@ ARM_FUNC_START ctzsi2
/* Don't bother with the old interworking routines for Thumb-2. */
/* ??? Maybe only omit these on "m" variants. */
@@ -139619,7 +160975,7 @@ LANG=C git diff --no-renames b4cb27f1fdef21fb547bd1a9c142c77a756a9922..d7ffbd29b
#if defined L_interwork_call_via_rX
-@@ -1983,11 +1989,12 @@ LSYM(Lchange_\register):
+@@ -1983,11 +2175,12 @@ LSYM(Lchange_\register):
.endm
#ifndef __symbian__
diff --git a/debian/patches/svn-updates.diff b/debian/patches/svn-updates.diff
index a0a3080..0267264 100644
--- a/debian/patches/svn-updates.diff
+++ b/debian/patches/svn-updates.diff
@@ -1,10 +1,10 @@
-# DP: updates from the 6 branch upto 20170205 (r245197).
+# DP: updates from the 6 branch upto 20170211 (r245353).
last_update()
{
cat > ${dir}LAST_UPDATED <<EOF
-Sun Feb 5 21:14:30 CET 2017
-Sun Feb 5 20:14:30 UTC 2017 (revision 245197)
+Sat Feb 11 01:43:35 CET 2017
+Sat Feb 11 00:43:35 UTC 2017 (revision 245353)
EOF
}
@@ -1106,6 +1106,44 @@ Index: ChangeLog
2016-12-21 Release Manager
* GCC 6.3.0 released.
+Index: libatomic/ChangeLog
+===================================================================
+--- a/src/libatomic/ChangeLog (.../tags/gcc_6_3_0_release)
++++ b/src/libatomic/ChangeLog (.../branches/gcc-6-branch)
+@@ -1,3 +1,11 @@
++2017-02-07 Szabolcs Nagy <szabolcs.nagy at arm.com>
++
++ Backport from mainline:
++ 2017-01-30 Szabolcs Nagy <szabolcs.nagy at arm.com>
++
++ PR target/78945
++ * config/arm/exch_n.c (libat_exchange): Check __ARM_FEATURE_SIMD32.
++
+ 2016-12-21 Release Manager
+
+ * GCC 6.3.0 released.
+Index: libatomic/config/arm/exch_n.c
+===================================================================
+--- a/src/libatomic/config/arm/exch_n.c (.../tags/gcc_6_3_0_release)
++++ b/src/libatomic/config/arm/exch_n.c (.../branches/gcc-6-branch)
+@@ -29,7 +29,7 @@
+ /* When using STREX to implement sub-word exchange, we can do much better
+ than the compiler by using the APSR.GE and APSR.C flags. */
+
+-#if !DONE && HAVE_STREX && !HAVE_STREXBH && N == 2
++#if !DONE && __ARM_FEATURE_SIMD32 && HAVE_STREX && !HAVE_STREXBH && N == 2
+ UTYPE
+ SIZE(libat_exchange) (UTYPE *mptr, UTYPE newval, int smodel)
+ {
+@@ -79,7 +79,7 @@
+ #endif /* !HAVE_STREXBH && N == 2 */
+
+
+-#if !DONE && HAVE_STREX && !HAVE_STREXBH && N == 1
++#if !DONE && __ARM_FEATURE_SIMD32 && HAVE_STREX && !HAVE_STREXBH && N == 1
+ UTYPE
+ SIZE(libat_exchange) (UTYPE *mptr, UTYPE newval, int smodel)
+ {
Index: config/ax_check_define.m4
===================================================================
--- a/src/config/ax_check_define.m4 (.../tags/gcc_6_3_0_release)
@@ -1393,7 +1431,7 @@ Index: gcc/DATESTAMP
+++ b/src/gcc/DATESTAMP (.../branches/gcc-6-branch)
@@ -1 +1 @@
-20161221
-+20170205
++20170211
Index: gcc/postreload.c
===================================================================
--- a/src/gcc/postreload.c (.../tags/gcc_6_3_0_release)
@@ -1637,7 +1675,43 @@ Index: gcc/ChangeLog
===================================================================
--- a/src/gcc/ChangeLog (.../tags/gcc_6_3_0_release)
+++ b/src/gcc/ChangeLog (.../branches/gcc-6-branch)
-@@ -1,3 +1,424 @@
+@@ -1,3 +1,460 @@
++2017-02-08 Segher Boessenkool <segher at kernel.crashing.org>
++
++ PR translation/79397
++ * config/rs6000/rs6000.opt (maltivec=le, maltivec=be): Fix spelling
++ of AltiVec.
++
++2017-02-08 Richard Biener <rguenther at suse.de>
++
++ Backport from mainline
++ 2017-02-08 Richard Biener <rguenther at suse.de>
++
++ PR tree-optimization/71824
++ * graphite-scop-detection.c (scop_detection::build_scop_breadth):
++ Check all loops contained in the merged region.
++
++ 2017-02-01 Richard Biener <rguenther at suse.de>
++
++ PR tree-optimization/71824
++ * graphite-scop-detection.c (scop_detection::build_scop_breadth):
++ Verify the loops are valid in the merged SESE region.
++ (scop_detection::can_represent_loop_1): Check analyzing the
++ evolution of the number of iterations in the region succeeds.
++
++ 2017-01-31 Richard Biener <rguenther at suse.de>
++
++ PR tree-optimization/77318
++ * graphite-sese-to-poly.c (extract_affine): Fix assert.
++ (create_pw_aff_from_tree): Take loop parameter.
++ (add_condition_to_pbb): Pass loop of the condition to
++ create_pw_aff_from_tree.
++
++2017-02-06 Dominique d'Humieres <dominiq at lps.ens.fr>
++
++ PR target/71017
++ * config/i386/cpuid.h: Fix undefined behavior.
++
+2017-02-03 Carl Love <cel at us.ibm.com>
+
+ Backport of two commits from mainline, r244943 and r244904,
@@ -3717,6 +3791,91 @@ Index: gcc/testsuite/gcc.dg/spellcheck-options-13.c
+/* { dg-do compile } */
+/* { dg-options "-fsanitize" } */
+/* { dg-error "unrecognized command line option .-fsanitize..$" "" { target *-*-* } 0 } */
+Index: gcc/testsuite/gcc.dg/graphite/pr71824-2.c
+===================================================================
+--- a/src/gcc/testsuite/gcc.dg/graphite/pr71824-2.c (.../tags/gcc_6_3_0_release)
++++ b/src/gcc/testsuite/gcc.dg/graphite/pr71824-2.c (.../branches/gcc-6-branch)
+@@ -0,0 +1,34 @@
++/* { dg-do compile } */
++/* { dg-options "-O2 -floop-nest-optimize" } */
++
++typedef struct { float x1; } bx;
++typedef struct {
++ int w;
++ short o;
++} T2P;
++T2P a;
++int b;
++void fn2();
++void fn3(bx*,short);
++void fn1() {
++ unsigned i = 0;
++ int c;
++ bx *d;
++ bx **h;
++ if (b == 0) {
++ fn2();
++ return;
++ }
++ for (; c; c++)
++ for (; i < 100; i++) {
++ d = h[i];
++ d->x1 = a.w;
++ }
++ for (; i < 100; i++) {
++ d = h[i];
++ d->x1 = a.w;
++ }
++ if (a.o)
++ for (; b;)
++ fn3(d, a.o);
++}
+Index: gcc/testsuite/gcc.dg/graphite/pr71824-3.c
+===================================================================
+--- a/src/gcc/testsuite/gcc.dg/graphite/pr71824-3.c (.../tags/gcc_6_3_0_release)
++++ b/src/gcc/testsuite/gcc.dg/graphite/pr71824-3.c (.../branches/gcc-6-branch)
+@@ -0,0 +1,19 @@
++/* { dg-do compile } */
++/* { dg-options "-O2 -ftree-loop-distribution -floop-nest-optimize" } */
++
++struct
++{
++ int bz;
++} od, ka[2];
++
++int fw;
++
++void
++pc (void)
++{
++ for (od.bz = 0; od.bz < 2; ++od.bz)
++ {
++ ++fw;
++ ka[0] = ka[1];
++ }
++}
+Index: gcc/testsuite/gcc.dg/graphite/pr71824.c
+===================================================================
+--- a/src/gcc/testsuite/gcc.dg/graphite/pr71824.c (.../tags/gcc_6_3_0_release)
++++ b/src/gcc/testsuite/gcc.dg/graphite/pr71824.c (.../branches/gcc-6-branch)
+@@ -0,0 +1,17 @@
++/* { dg-do compile } */
++/* { dg-options "-O2 -floop-nest-optimize" } */
++
++int a, b, d;
++int **c;
++int fn1() {
++ while (a)
++ if (d) {
++ int e = -d;
++ for (; b < e; b++)
++ c[b] = &a;
++ } else {
++ for (; b; b++)
++ c[b] = &b;
++ d = 0;
++ }
++}
Index: gcc/testsuite/gcc.dg/lto/pr69188_0.c
===================================================================
--- a/src/gcc/testsuite/gcc.dg/lto/pr69188_0.c (.../tags/gcc_6_3_0_release)
@@ -3866,7 +4025,26 @@ Index: gcc/testsuite/ChangeLog
===================================================================
--- a/src/gcc/testsuite/ChangeLog (.../tags/gcc_6_3_0_release)
+++ b/src/gcc/testsuite/ChangeLog (.../branches/gcc-6-branch)
-@@ -1,3 +1,295 @@
+@@ -1,3 +1,314 @@
++2017-02-08 Richard Biener <rguenther at suse.de>
++
++ Backport from mainline
++ 2017-02-08 Richard Biener <rguenther at suse.de>
++
++ PR tree-optimization/71824
++ PR tree-optimization/79409
++ * gcc.dg/graphite/pr71824-3.c: New testcase.
++
++ 2017-02-08 Richard Biener <rguenther at suse.de>
++
++ PR tree-optimization/71824
++ * gcc.dg/graphite/pr71824-2.c: New testcase.
++
++ 2017-02-01 Richard Biener <rguenther at suse.de>
++
++ PR tree-optimization/71824
++ * gcc.dg/graphite/pr71824.c: New testcase.
++
+2017-02-03 Carl Love <cel at us.ibm.com>
+
+ * gcc.target/powerpc/builtins-3-p8.c: Add new testfile for missing
@@ -4515,6 +4693,22 @@ Index: gcc/testsuite/g++.dg/ipa/pr71207.C
+
+ return 0;
+}
+Index: gcc/testsuite/g++.dg/cpp1y/constexpr-union1.C
+===================================================================
+--- a/src/gcc/testsuite/g++.dg/cpp1y/constexpr-union1.C (.../tags/gcc_6_3_0_release)
++++ b/src/gcc/testsuite/g++.dg/cpp1y/constexpr-union1.C (.../branches/gcc-6-branch)
+@@ -0,0 +1,11 @@
++// PR c++/78897
++// { dg-do compile { target c++14 } }
++
++struct Optional {
++ constexpr Optional() : _dummy{} { _value = 1; }
++ union {
++ int _dummy;
++ int _value;
++ };
++};
++Optional opt{};
Index: gcc/testsuite/g++.dg/vect/pr36648.cc
===================================================================
--- a/src/gcc/testsuite/g++.dg/vect/pr36648.cc (.../tags/gcc_6_3_0_release)
@@ -4571,7 +4765,19 @@ Index: gcc/cp/constexpr.c
===================================================================
--- a/src/gcc/cp/constexpr.c (.../tags/gcc_6_3_0_release)
+++ b/src/gcc/cp/constexpr.c (.../branches/gcc-6-branch)
-@@ -5295,6 +5295,7 @@
+@@ -3239,6 +3239,11 @@
+ tree fields = TYPE_FIELDS (DECL_CONTEXT (index));
+ unsigned HOST_WIDE_INT idx;
+
++ if (code == UNION_TYPE && CONSTRUCTOR_NELTS (*valp)
++ && CONSTRUCTOR_ELT (*valp, 0)->index != index)
++ /* Changing active member. */
++ vec_safe_truncate (CONSTRUCTOR_ELTS (*valp), 0);
++
+ for (idx = 0;
+ vec_safe_iterate (CONSTRUCTOR_ELTS (*valp), idx, &cep);
+ idx++, fields = DECL_CHAIN (fields))
+@@ -5295,6 +5300,7 @@
/* We can see these in statement-expressions. */
return true;
@@ -4602,7 +4808,13 @@ Index: gcc/cp/ChangeLog
===================================================================
--- a/src/gcc/cp/ChangeLog (.../tags/gcc_6_3_0_release)
+++ b/src/gcc/cp/ChangeLog (.../branches/gcc-6-branch)
-@@ -1,3 +1,47 @@
+@@ -1,3 +1,53 @@
++2017-02-10 Jason Merrill <jason at redhat.com>
++
++ PR c++/78897 - constexpr union
++ * constexpr.c (cxx_eval_store_expression): A store to a union member
++ erases a previous store to another member.
++
+2017-01-26 Jason Merrill <jason at redhat.com>
+
+ PR c++/79176 - lambda ICE with -flto -Os
@@ -5594,7 +5806,13 @@ Index: gcc/fortran/ChangeLog
===================================================================
--- a/src/gcc/fortran/ChangeLog (.../tags/gcc_6_3_0_release)
+++ b/src/gcc/fortran/ChangeLog (.../branches/gcc-6-branch)
-@@ -1,3 +1,42 @@
+@@ -1,3 +1,48 @@
++2017-02-07 Steven G. Kargl <kargl at gcc.gnu.org>
++
++ * trans-types.c (gfc_get_int_kind_from_width_isofortranen): Choose
++ REAL type with the widest precision if two (or more) have the same
++ storage size.
++
+2017-01-29 Andre Vehreschild <vehre at gcc.gnu.org>
+
+ Backported from trunk
@@ -5637,6 +5855,60 @@ Index: gcc/fortran/ChangeLog
2016-12-21 Release Manager
* GCC 6.3.0 released.
+Index: gcc/fortran/trans-types.c
+===================================================================
+--- a/src/gcc/fortran/trans-types.c (.../tags/gcc_6_3_0_release)
++++ b/src/gcc/fortran/trans-types.c (.../branches/gcc-6-branch)
+@@ -234,27 +234,42 @@
+ return -1;
+ }
+
+-/* Get the kind number corresponding to a real of given storage size,
+- following the required return values for ISO_FORTRAN_ENV REAL* constants:
+- -2 is returned if we support a kind of larger size, -1 otherwise. */
++
++/* Get the kind number corresponding to a real of a given storage size.
++ If two real's have the same storage size, then choose the real with
++ the largest precision. If a kind type is unavailable and a real
++ exists with wider storage, then return -2; otherwise, return -1. */
++
+ int
+ gfc_get_real_kind_from_width_isofortranenv (int size)
+ {
+- int i;
++ int digits, i, kind;
+
+ size /= 8;
+
++ kind = -1;
++ digits = 0;
++
+ /* Look for a kind with matching storage size. */
+ for (i = 0; gfc_real_kinds[i].kind != 0; i++)
+ if (int_size_in_bytes (gfc_get_real_type (gfc_real_kinds[i].kind)) == size)
+- return gfc_real_kinds[i].kind;
++ {
++ if (gfc_real_kinds[i].digits > digits)
++ {
++ digits = gfc_real_kinds[i].digits;
++ kind = gfc_real_kinds[i].kind;
++ }
++ }
+
++ if (kind != -1)
++ return kind;
++
+ /* Look for a kind with larger storage size. */
+ for (i = 0; gfc_real_kinds[i].kind != 0; i++)
+ if (int_size_in_bytes (gfc_get_real_type (gfc_real_kinds[i].kind)) > size)
+- return -2;
++ kind = -2;
+
+- return -1;
++ return kind;
+ }
+
+
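The tie-breaking rule the hunk above introduces is easy to model in
isolation. A hypothetical C sketch (the table approximates x86-64 gfortran,
where kind 10 is padded to 16 bytes and so ties with kind 16 in storage
size; neither kind_info nor pick_kind exists in the sources):

    #include <stdio.h>

    struct kind_info { int kind; int size_bytes; int digits; };

    /* Among kinds with the requested storage size, prefer the one with
       the most digits; return -2 only when a wider kind exists, else -1.  */
    static int
    pick_kind (const struct kind_info *k, int n, int size)
    {
      int best = -1, digits = 0;

      for (int i = 0; i < n; i++)
        if (k[i].size_bytes == size && k[i].digits > digits)
          {
            digits = k[i].digits;
            best = k[i].kind;
          }
      if (best != -1)
        return best;
      for (int i = 0; i < n; i++)
        if (k[i].size_bytes > size)
          return -2;
      return -1;
    }

    int
    main (void)
    {
      struct kind_info k[] = { { 4, 4, 6 }, { 8, 8, 15 },
                               { 10, 16, 18 }, { 16, 16, 33 } };
      printf ("%d\n", pick_kind (k, 4, 16));   /* prints 16, not 10 */
      return 0;
    }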
Index: gcc/fortran/resolve.c
===================================================================
--- a/src/gcc/fortran/resolve.c (.../tags/gcc_6_3_0_release)
@@ -5716,6 +5988,39 @@ Index: gcc/tree-data-ref.c
A[index][0] = mult * int_cst_value (CHREC_RIGHT (chrec));
return initialize_matrix_A (A, CHREC_LEFT (chrec), index + 1, mult);
+Index: gcc/graphite-scop-detection.c
+===================================================================
+--- a/src/gcc/graphite-scop-detection.c (.../tags/gcc_6_3_0_release)
++++ b/src/gcc/graphite-scop-detection.c (.../branches/gcc-6-branch)
+@@ -905,7 +905,19 @@
+
+ sese_l combined = merge_sese (s1, s2);
+
++ /* Combining adjacent loops may add unrelated loops into the
++ region so we have to check all sub-loops of the outer loop
++ that are in the combined region. */
+ if (combined)
++ for (l = loop_outer (loop)->inner; l; l = l->next)
++ if (bb_in_sese_p (l->header, combined)
++ && ! loop_is_valid_in_scop (l, combined))
++ {
++ combined = invalid_sese;
++ break;
++ }
++
++ if (combined)
+ s1 = combined;
+ else
+ add_scop (s2);
+@@ -931,6 +943,8 @@
+ && niter_desc.control.no_overflow
+ && (niter = number_of_latch_executions (loop))
+ && !chrec_contains_undetermined (niter)
++ && !chrec_contains_undetermined (scalar_evolution_in_region (scop,
++ loop, niter))
+ && graphite_can_represent_expr (scop, loop, niter);
+ }
+
Index: gcc/ubsan.c
===================================================================
--- a/src/gcc/ubsan.c (.../tags/gcc_6_3_0_release)
@@ -47185,6 +47490,45 @@ Index: gcc/varasm.c
/* We cannot share RTX'es in pool entries.
Mark this piece of RTL as required for unsharing. */
RTX_FLAG (rtl, used) = 1;
+Index: gcc/graphite-sese-to-poly.c
+===================================================================
+--- a/src/gcc/graphite-sese-to-poly.c (.../tags/gcc_6_3_0_release)
++++ b/src/gcc/graphite-sese-to-poly.c (.../branches/gcc-6-branch)
+@@ -407,7 +407,7 @@
+
+ case SSA_NAME:
+ gcc_assert (-1 != parameter_index_in_region_1 (e, s->scop_info)
+- || !invariant_in_sese_p_rec (e, s->scop_info->region, NULL));
++ || defined_in_sese_p (e, s->scop_info->region));
+ res = extract_affine_name (s, e, space);
+ break;
+
+@@ -436,11 +436,11 @@
+ /* Returns a linear expression for tree T evaluated in PBB. */
+
+ static isl_pw_aff *
+-create_pw_aff_from_tree (poly_bb_p pbb, tree t)
++create_pw_aff_from_tree (poly_bb_p pbb, loop_p loop, tree t)
+ {
+ scop_p scop = PBB_SCOP (pbb);
+
+- t = scalar_evolution_in_region (scop->scop_info->region, pbb_loop (pbb), t);
++ t = scalar_evolution_in_region (scop->scop_info->region, loop, t);
+
+ gcc_assert (!chrec_contains_undetermined (t));
+ gcc_assert (!automatically_generated_chrec_p (t));
+@@ -455,8 +455,9 @@
+ static void
+ add_condition_to_pbb (poly_bb_p pbb, gcond *stmt, enum tree_code code)
+ {
+- isl_pw_aff *lhs = create_pw_aff_from_tree (pbb, gimple_cond_lhs (stmt));
+- isl_pw_aff *rhs = create_pw_aff_from_tree (pbb, gimple_cond_rhs (stmt));
++ loop_p loop = gimple_bb (stmt)->loop_father;
++ isl_pw_aff *lhs = create_pw_aff_from_tree (pbb, loop, gimple_cond_lhs (stmt));
++ isl_pw_aff *rhs = create_pw_aff_from_tree (pbb, loop, gimple_cond_rhs (stmt));
+
+ isl_set *cond;
+ switch (code)
Index: gcc/tree-profile.c
===================================================================
--- a/src/gcc/tree-profile.c (.../tags/gcc_6_3_0_release)
@@ -47317,6 +47661,19 @@ Index: gcc/config/i386/rtemself.h
-#define LONG_DOUBLE_TYPE_SIZE (TARGET_80387 ? 80 : 64)
-
#define IX86_NO_LIBGCC_TFMODE
+Index: gcc/config/i386/cpuid.h
+===================================================================
+--- a/src/gcc/config/i386/cpuid.h (.../tags/gcc_6_3_0_release)
++++ b/src/gcc/config/i386/cpuid.h (.../branches/gcc-6-branch)
+@@ -89,7 +89,7 @@
+ #define bit_AVX512CD (1 << 28)
+ #define bit_SHA (1 << 29)
+ #define bit_AVX512BW (1 << 30)
+-#define bit_AVX512VL (1 << 31)
++#define bit_AVX512VL (1u << 31)
+
+ /* %ecx */
+ #define bit_PREFETCHWT1 (1 << 0)
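Context for the one-character cpuid.h change above: with a 32-bit int,
1 << 31 shifts a set bit into the sign position, which is undefined
behaviour in C, so bit_AVX512VL needs an unsigned operand. A minimal
illustration (BIT_EXAMPLE is a stand-in name, not the header's macro):

    #include <stdio.h>

    #define BIT_EXAMPLE (1u << 31)   /* well defined: 0x80000000 */

    int
    main (void)
    {
      /* (1 << 31) would overflow a 32-bit signed int during the shift;
         the unsigned literal keeps the expression defined everywhere.  */
      printf ("0x%x\n", BIT_EXAMPLE);
      return 0;
    }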
Index: gcc/config/i386/i386.c
===================================================================
--- a/src/gcc/config/i386/i386.c (.../tags/gcc_6_3_0_release)
@@ -47698,6 +48055,20 @@ Index: gcc/config/rs6000/rs6000.opt
===================================================================
--- a/src/gcc/config/rs6000/rs6000.opt (.../tags/gcc_6_3_0_release)
+++ b/src/gcc/config/rs6000/rs6000.opt (.../branches/gcc-6-branch)
+@@ -142,11 +142,11 @@
+
+ maltivec=le
+ Target Report RejectNegative Var(rs6000_altivec_element_order, 1) Save
+-Generate Altivec instructions using little-endian element order.
++Generate AltiVec instructions using little-endian element order.
+
+ maltivec=be
+ Target Report RejectNegative Var(rs6000_altivec_element_order, 2)
+-Generate Altivec instructions using big-endian element order.
++Generate AltiVec instructions using big-endian element order.
+
+ mhard-dfp
+ Target Report Mask(DFP) Var(rs6000_isa_flags)
@@ -602,7 +602,7 @@
Analyze and remove doubleword swaps from VSX computations.
diff --git a/debian/rules.patch b/debian/rules.patch
index 68205c6..eeca2f6 100644
--- a/debian/rules.patch
+++ b/debian/rules.patch
@@ -15,8 +15,6 @@ series_file ?= $(patchdir)/series
debian_patches = \
svn-updates \
libiberty-updates \
- $(if $(with_linaro_branch),gcc-linaro-r244724-revert) \
- $(if $(with_linaro_branch),gcc-linaro-r244242-revert) \
$(if $(with_linaro_branch),gcc-linaro) \
$(if $(with_linaro_branch),gcc-linaro-no-macros) \
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/gcc-6.git